From fdad95c055ad5274e3a0b053558fd76b1766243b Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Thu, 20 Mar 2014 20:34:51 +0100
Subject: [PATCH 1/5] PixelShaderGen: Cleanups and fixes for tev combiners.

Fixes issue 4674.
---
 Source/Core/VideoCommon/PixelShaderGen.cpp | 101 +++++++++++----------
 1 file changed, 53 insertions(+), 48 deletions(-)

diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp
index 29399a253f..961ca9f3b1 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/PixelShaderGen.cpp
@@ -90,27 +90,6 @@ static const char *tevKSelTableA[] =
 	I_KCOLORS"[3].a", // K3_A = 0x1F
 };
 
-static const char *tevScaleTable[] =
-{
-	"",       // SCALE_1
-	" << 1",  // SCALE_2
-	" << 2",  // SCALE_4
-	" >> 1",  // DIVIDE_2
-};
-
-static const char *tevBiasTable[] =
-{
-	"",       // ZERO,
-	"+ 128",  // ADDHALF,
-	"- 128",  // SUBHALF,
-	"",
-};
-
-static const char *tevOpTable[] = {
-	"+",      // TEVOP_ADD = 0,
-	"-",      // TEVOP_SUB = 1,
-};
-
 static const char *tevCInputTable[] =
 {
 	"prev.rgb",          // CPREV,
@@ -343,7 +322,8 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T
 	          "\tint3 comp16 = int3(1, 256, 0), comp24 = int3(1, 256, 256*256);\n"
 	          "\tint alphabump=0;\n"
 	          "\tint3 tevcoord=int3(0, 0, 0);\n"
-	          "\tint2 wrappedcoord=int2(0,0), tempcoord=int2(0,0);\n\n");
+	          "\tint2 wrappedcoord=int2(0,0), tempcoord=int2(0,0);\n"
+	          "\tint4 tevin_a=int4(0,0,0,0),tevin_b=int4(0,0,0,0),tevin_c=int4(0,0,0,0),tevin_d=int4(0,0,0,0);\n\n"); // tev combiner inputs
 
 	if (ApiType == API_OPENGL)
 	{
@@ -778,6 +758,33 @@ static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, AP
 	if (ac.dest >= GX_TEVREG0 && ac.dest <= GX_TEVREG2)
 		out.SetConstantsUsed(C_COLORS+ac.dest, C_COLORS+ac.dest);
 
+
+	const char *tevScaleTable[] =
+	{
+		"",       // SCALE_1
+		" << 1",  // SCALE_2
+		" << 2",  // SCALE_4
+		" >> 1",  // DIVIDE_2
+	};
+
+	const char *tevBiasTable[] =
+	{
+		"",       // ZERO,
+		"+ 128",  // ADDHALF,
+		"- 128",  // SUBHALF,
+		"",
+	};
+
+	const char *tevOpTable[] = {
+		"+",      // TEVOP_ADD = 0,
+		"-",      // TEVOP_SUB = 1,
+	};
+
+	out.Write("tevin_a = int4(%s, %s.a)&255;\n", tevCInputTable[cc.a], tevAInputTable[ac.a]);
+	out.Write("tevin_b = int4(%s, %s.a)&255;\n", tevCInputTable[cc.b], tevAInputTable[ac.b]);
+	out.Write("tevin_c = int4(%s, %s.a)&255;\n", tevCInputTable[cc.c], tevAInputTable[ac.c]);
+	out.Write("tevin_d = int4(%s, %s.a);\n", tevCInputTable[cc.d], tevAInputTable[ac.d]);
+
 	out.Write("\t// color combine\n");
 	out.Write("\t%s = clamp(", tevCOutputTable[cc.dest]);
 
@@ -789,9 +796,9 @@ static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, AP
 			out.Write("(");
 
 		if (!(cc.d == TEVCOLORARG_ZERO && cc.op == TEVOP_ADD))
-			out.Write("%s %s ", tevCInputTable[cc.d], tevOpTable[cc.op]);
+			out.Write("tevin_d.rgb %s ", tevOpTable[cc.op]);
 
-		out.Write("((%s&255) * (int3(255,255,255) - (%s&255)) + (%s&255) * (%s&255)) / 255", tevCInputTable[cc.a], tevCInputTable[cc.c], tevCInputTable[cc.b], tevCInputTable[cc.c]);
+		out.Write("(tevin_a.rgb * (int3(255,255,255) - tevin_c.rgb) + tevin_b.rgb * tevin_c.rgb) / 255");
 
 		out.Write(" %s", tevBiasTable[cc.bias]);
 
@@ -802,20 +809,19 @@ static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, AP
 	{
 		const char *function_table[] =
 		{
-			"(((%s.r&255) > %s.r) ? (%s&255): int3(0,0,0))", // TEVCMP_R8_GT
-			"(((%s.r&255) == %s.r) ? (%s&255): int3(0,0,0))", // TEVCMP_R8_EQ
-			"((idot((%s.rgb&255), comp16) >  idot((%s.rgb&255), comp16)) ? (%s&255): int3(0,0,0))", // TEVCMP_GR16_GT
-			"((idot((%s.rgb&255), comp16) == idot((%s.rgb&255), comp16)) ? (%s&255): int3(0,0,0))", // TEVCMP_GR16_EQ
-			"((idot((%s.rgb&255), comp24) >  idot((%s.rgb&255), comp24)) ? (%s&255): int3(0,0,0))", // TEVCMP_BGR24_GT
-			"((idot((%s.rgb&255), comp24) == idot((%s.rgb&255), comp24)) ? (%s&255): int3(0,0,0))", // TEVCMP_BGR24_EQ
-			"int3(max(sign(int3((%s.rgb&255)) - int3((%s.rgb&255))), int3(0,0,0)) * (%s&255))", // TEVCMP_RGB8_GT
-			"int3((int3(255,255,255) - max(sign(abs(int3((%s.rgb&255)) - int3((%s.rgb&255)))), int3(0,0,0))) * (%s&255))" // TEVCMP_RGB8_EQ
+			"((tevin_a.r > tevin_b.r) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_R8_GT
+			"((tevin_a.r == tevin_b.r) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_R8_EQ
+			"((idot(tevin_a.rgb, comp16) >  idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_GR16_GT
+			"((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_GR16_EQ
+			"((idot(tevin_a.rgb, comp24) >  idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_BGR24_GT
+			"((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_BGR24_EQ
+			"(max(sign(tevin_a.rgb - tevin_b.rgb), int3(0,0,0)) * tevin_c.rgb)", // TEVCMP_RGB8_GT
+			"((int3(255,255,255) - max(sign(abs(tevin_a.rgb - tevin_b.rgb))), int3(0,0,0))) * tevin_c.rgb)" // TEVCMP_RGB8_EQ
 		};
 
 		int mode = (cc.shift<<1)|cc.op;
-		out.Write("   %s + ", tevCInputTable[cc.d]);
-		out.Write(function_table[mode], tevCInputTable[cc.a],
-		          tevCInputTable[cc.b], tevCInputTable[cc.c]);
+		out.Write("   tevin_d.rgb + ");
+		out.Write(function_table[mode]);
 	}
 	if (cc.clamp)
 		out.Write(", int3(0,0,0), int3(255,255,255))");
@@ -833,9 +839,9 @@ static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, AP
 			out.Write("(");
 
 		if (!(ac.d == TEVALPHAARG_ZERO && ac.op == TEVOP_ADD))
-			out.Write("%s.a %s ", tevAInputTable[ac.d], tevOpTable[ac.op]);
+			out.Write("tevin_d.a %s ", tevOpTable[ac.op]);
 
-		out.Write("((%s.a&255) * (255 - (%s.a&255)) + (%s.a&255) * (%s.a&255)) / 255", tevAInputTable[ac.a], tevAInputTable[ac.c], tevAInputTable[ac.b], tevAInputTable[ac.c]);
+		out.Write("(tevin_a.a * (255 - tevin_c.a) + tevin_b.a * tevin_c.a) / 255");
 
 		out.Write(" %s",tevBiasTable[ac.bias]);
 
@@ -846,20 +852,19 @@ static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, AP
 	{
 		const char *function_table[] =
 		{
-			"(((%s.r&255) > (%s.r&255)) ? (%s.a&255) : 0)", // TEVCMP_R8_GT
-			"(((%s.r&255) == (%s.r&255)) ? (%s.a&255) : 0)", // TEVCMP_R8_EQ
-			"((idot((%s.rgb&255), comp16) >  idot((%s.rgb&255), comp16)) ? (%s.a&255) : 0)", // TEVCMP_GR16_GT
-			"((idot((%s.rgb&255), comp16) == idot((%s.rgb&255), comp16)) ? (%s.a&255) : 0)", // TEVCMP_GR16_EQ
-			"((idot((%s.rgb&255), comp24) >  idot((%s.rgb&255), comp24)) ? (%s.a&255) : 0)", // TEVCMP_BGR24_GT
-			"((idot((%s.rgb&255), comp24) == idot((%s.rgb&255), comp24)) ? (%s.a&255) : 0)", // TEVCMP_BGR24_EQ
-			"(((%s.a&255) >  (%s.a&255)) ? (%s.a&255) : 0)", // TEVCMP_A8_GT
-			"(((%s.a&255) == (%s.a&255)) ? (%s.a&255) : 0)" // TEVCMP_A8_EQ
+			"((tevin_a.r > tevin_b.r) ? tevin_c.a : 0)", // TEVCMP_R8_GT
+			"((tevin_a.r == tevin_b.r) ? tevin_c.a : 0)", // TEVCMP_R8_EQ
+			"((idot(tevin_a.rgb, comp16) >  idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)", // TEVCMP_GR16_GT
+			"((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)", // TEVCMP_GR16_EQ
+			"((idot(tevin_a.rgb, comp24) >  idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)", // TEVCMP_BGR24_GT
+			"((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)", // TEVCMP_BGR24_EQ
+			"((tevin_a.a >  tevin_b.a) ? tevin_c.a : 0)", // TEVCMP_A8_GT
+			"((tevin_a.a == tevin_b.a) ? tevin_c.a : 0)" // TEVCMP_A8_EQ
 		};
 
 		int mode = (ac.shift<<1)|ac.op;
-		out.Write("   %s.a + ", tevAInputTable[ac.d]);
-		out.Write(function_table[mode], tevAInputTable[ac.a],
-		          tevAInputTable[ac.b], tevAInputTable[ac.c]);
+		out.Write("   tevin_d.a + ");
+		out.Write(function_table[mode]);
 	}
 	if (ac.clamp)
 		out.Write(", 0, 255)");

From 4f82d6f7aff107d3cb941df111250c66aa9f5ea8 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Thu, 20 Mar 2014 20:49:09 +0100
Subject: [PATCH 2/5] PixelShaderGen: Implement tev combiner lerping in a
 faster way which also reproduces hardware behavior perfectly.

The new behavior has been verified to be correct by hardware tests. This is an improvement over the old code, which was just a guess.
---
 Source/Core/VideoCommon/PixelShaderGen.cpp | 52 +++++++++-------------
 1 file changed, 22 insertions(+), 30 deletions(-)

diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp
index 961ca9f3b1..ff8b9c2f1c 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/PixelShaderGen.cpp
@@ -759,19 +759,35 @@ static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, AP
 		out.SetConstantsUsed(C_COLORS+ac.dest, C_COLORS+ac.dest);
 
 
-	const char *tevScaleTable[] =
+	const char *tevScaleTableLeft[] =
 	{
 		"",       // SCALE_1
 		" << 1",  // SCALE_2
 		" << 2",  // SCALE_4
+		"",       // DIVIDE_2
+	};
+
+	const char *tevScaleTableRight[] =
+	{
+		"",       // SCALE_1
+		"",       // SCALE_2
+		"",       // SCALE_4
 		" >> 1",  // DIVIDE_2
 	};
 
+	const char *tevLerpBias[] = // indexed by 2*op+(shift==3)
+	{
+		"",
+		" + 128",
+		"",
+		" + 127",
+	};
+
 	const char *tevBiasTable[] =
 	{
-		"",       // ZERO,
-		"+ 128",  // ADDHALF,
-		"- 128",  // SUBHALF,
+		"",        // ZERO,
+		" + 128",  // ADDHALF,
+		" - 128",  // SUBHALF,
 		"",
 	};
 
@@ -791,19 +807,7 @@ static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, AP
 	// combine the color channel
 	if (cc.bias != TevBias_COMPARE) // if not compare
 	{
-		//normal color combiner goes here
-		if (cc.shift > TEVSCALE_1)
-			out.Write("(");
-
-		if (!(cc.d == TEVCOLORARG_ZERO && cc.op == TEVOP_ADD))
-			out.Write("tevin_d.rgb %s ", tevOpTable[cc.op]);
-
-		out.Write("(tevin_a.rgb * (int3(255,255,255) - tevin_c.rgb) + tevin_b.rgb * tevin_c.rgb) / 255");
-
-		out.Write(" %s", tevBiasTable[cc.bias]);
-
-		if (cc.shift > TEVSCALE_1)
-			out.Write(")%s", tevScaleTable[cc.shift]);
+		out.Write("(((tevin_d.rgb%s)%s) %s ((((tevin_a.rgb*256 + (tevin_b.rgb-tevin_a.rgb)*(tevin_c.rgb+(tevin_c.rgb>>7)))%s)%s)>>8))%s", tevBiasTable[cc.bias], tevScaleTableLeft[cc.shift], tevOpTable[cc.op], tevScaleTableLeft[cc.shift], tevLerpBias[2*cc.op+(cc.shift==3)], tevScaleTableRight[cc.shift]);
 	}
 	else
 	{
@@ -834,19 +838,7 @@ static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, AP
 
 	if (ac.bias != TevBias_COMPARE) // if not compare
 	{
-		//normal alpha combiner goes here
-		if (ac.shift > 0)
-			out.Write("(");
-
-		if (!(ac.d == TEVALPHAARG_ZERO && ac.op == TEVOP_ADD))
-			out.Write("tevin_d.a %s ", tevOpTable[ac.op]);
-
-		out.Write("(tevin_a.a * (255 - tevin_c.a) + tevin_b.a * tevin_c.a) / 255");
-
-		out.Write(" %s",tevBiasTable[ac.bias]);
-
-		if (ac.shift>0)
-			out.Write(")%s", tevScaleTable[ac.shift]);
+		out.Write("(((tevin_d.a%s)%s) %s ((((tevin_a.a*256 + (tevin_b.a-tevin_a.a)*(tevin_c.a+(tevin_c.a>>7)))%s)%s)>>8))%s", tevBiasTable[ac.bias], tevScaleTableLeft[ac.shift], tevOpTable[ac.op], tevScaleTableLeft[ac.shift], tevLerpBias[2*ac.op+(ac.shift==3)], tevScaleTableRight[ac.shift]);
 	}
 	else
 	{

From eb0f547a17c74bf03827c48c7b8fce6b7a303174 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Mon, 24 Mar 2014 14:41:56 +0100
Subject: [PATCH 3/5] PixelShaderGen: Cleanups.

---
 Source/Core/VideoCommon/PixelShaderGen.cpp | 166 +++++++++++----------
 1 file changed, 90 insertions(+), 76 deletions(-)

diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp
index ff8b9c2f1c..85d25b2cac 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/PixelShaderGen.cpp
@@ -112,14 +112,14 @@ static const char *tevCInputTable[] =
 
 static const char *tevAInputTable[] =
 {
-	"prev",            // APREV,
-	"c0",              // A0,
-	"c1",              // A1,
-	"c2",              // A2,
-	"textemp",         // TEXA,
-	"rastemp",         // RASA,
-	"konsttemp",       // KONST,  (hw1 had quarter)
-	"int4(0,0,0,0)",   // ZERO
+	"prev.a",        // APREV,
+	"c0.a",          // A0,
+	"c1.a",          // A1,
+	"c2.a",          // A2,
+	"textemp.a",     // TEXA,
+	"rastemp.a",     // RASA,
+	"konsttemp.a",   // KONST,  (hw1 had quarter)
+	"0",             // ZERO
 };
 
 static const char *tevRasTable[] =
@@ -140,6 +140,7 @@ static const char *tevAOutputTable[]  = { "prev.a", "c0.a", "c1.a", "c2.a" };
 static char text[16384];
 
 template<class T> static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, API_TYPE ApiType, const char swapModeTable[4][5]);
+template<class T> static inline void WriteTevRegular(T& out, const char* components, int bias, int op, int clamp, int shift);
 template<class T> static inline void SampleTexture(T& out, const char *texcoords, const char *texswap, int texmap, API_TYPE ApiType);
 template<class T> static inline void WriteAlphaTest(T& out, pixel_shader_uid_data& uid_data, API_TYPE ApiType,DSTALPHA_MODE dstAlphaMode, bool per_pixel_depth);
 template<class T> static inline void WriteFog(T& out, pixel_shader_uid_data& uid_data);
@@ -759,6 +760,76 @@ static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, AP
 		out.SetConstantsUsed(C_COLORS+ac.dest, C_COLORS+ac.dest);
 
 
+	out.Write("tevin_a = int4(%s, %s)&255;\n", tevCInputTable[cc.a], tevAInputTable[ac.a]);
+	out.Write("tevin_b = int4(%s, %s)&255;\n", tevCInputTable[cc.b], tevAInputTable[ac.b]);
+	out.Write("tevin_c = int4(%s, %s)&255;\n", tevCInputTable[cc.c], tevAInputTable[ac.c]);
+	out.Write("tevin_d = int4(%s, %s);\n", tevCInputTable[cc.d], tevAInputTable[ac.d]);
+
+	out.Write("\t// color combine\n");
+	out.Write("\t%s = clamp(", tevCOutputTable[cc.dest]);
+	if (cc.bias != TevBias_COMPARE)
+	{
+		WriteTevRegular(out, "rgb", cc.bias, cc.op, cc.clamp, cc.shift);
+	}
+	else
+	{
+		const char *function_table[] =
+		{
+			"((tevin_a.r > tevin_b.r) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_R8_GT
+			"((tevin_a.r == tevin_b.r) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_R8_EQ
+			"((idot(tevin_a.rgb, comp16) >  idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_GR16_GT
+			"((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_GR16_EQ
+			"((idot(tevin_a.rgb, comp24) >  idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_BGR24_GT
+			"((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_BGR24_EQ
+			"(max(sign(tevin_a.rgb - tevin_b.rgb), int3(0,0,0)) * tevin_c.rgb)", // TEVCMP_RGB8_GT: per channel, c if a > b, else 0
+			"((int3(1,1,1) - sign(abs(tevin_a.rgb - tevin_b.rgb))) * tevin_c.rgb)" // TEVCMP_RGB8_EQ: per channel, c if a == b, else 0 (sign(abs(x)) is 0 or 1)
+		};
+
+		int mode = (cc.shift<<1)|cc.op;
+		out.Write("   tevin_d.rgb + ");
+		out.Write(function_table[mode]);
+	}
+	if (cc.clamp)
+		out.Write(", int3(0,0,0), int3(255,255,255))");
+	else
+		out.Write(", int3(-1024,-1024,-1024), int3(1023,1023,1023))");
+	out.Write(";\n");
+
+	out.Write("\t// alpha combine\n");
+	out.Write("\t%s = clamp(", tevAOutputTable[ac.dest]);
+	if (ac.bias != TevBias_COMPARE)
+	{
+		WriteTevRegular(out, "a", ac.bias, ac.op, ac.clamp, ac.shift);
+	}
+	else
+	{
+		const char *function_table[] =
+		{
+			"((tevin_a.r > tevin_b.r) ? tevin_c.a : 0)", // TEVCMP_R8_GT
+			"((tevin_a.r == tevin_b.r) ? tevin_c.a : 0)", // TEVCMP_R8_EQ
+			"((idot(tevin_a.rgb, comp16) >  idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)", // TEVCMP_GR16_GT
+			"((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)", // TEVCMP_GR16_EQ
+			"((idot(tevin_a.rgb, comp24) >  idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)", // TEVCMP_BGR24_GT
+			"((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)", // TEVCMP_BGR24_EQ
+			"((tevin_a.a >  tevin_b.a) ? tevin_c.a : 0)", // TEVCMP_A8_GT
+			"((tevin_a.a == tevin_b.a) ? tevin_c.a : 0)" // TEVCMP_A8_EQ
+		};
+
+		int mode = (ac.shift<<1)|ac.op;
+		out.Write("   tevin_d.a + ");
+		out.Write(function_table[mode]);
+	}
+	if (ac.clamp)
+		out.Write(", 0, 255)");
+	else
+		out.Write(", -1024, 1023)");
+
+	out.Write(";\n\n");
+}
+
+template<class T>
+static inline void WriteTevRegular(T& out, const char* components, int bias, int op, int clamp, int shift)
+{
 	const char *tevScaleTableLeft[] =
 	{
 		"",       // SCALE_1
@@ -796,74 +867,17 @@ static inline void WriteStage(T& out, pixel_shader_uid_data& uid_data, int n, AP
 		"-",      // TEVOP_SUB = 1,
 	};
 
-	out.Write("tevin_a = int4(%s, %s.a)&255;\n", tevCInputTable[cc.a], tevAInputTable[ac.a]);
-	out.Write("tevin_b = int4(%s, %s.a)&255;\n", tevCInputTable[cc.b], tevAInputTable[ac.b]);
-	out.Write("tevin_c = int4(%s, %s.a)&255;\n", tevCInputTable[cc.c], tevAInputTable[ac.c]);
-	out.Write("tevin_d = int4(%s, %s.a);\n", tevCInputTable[cc.d], tevAInputTable[ac.d]);
-
-	out.Write("\t// color combine\n");
-	out.Write("\t%s = clamp(", tevCOutputTable[cc.dest]);
-
-	// combine the color channel
-	if (cc.bias != TevBias_COMPARE) // if not compare
-	{
-		out.Write("(((tevin_d.rgb%s)%s) %s ((((tevin_a.rgb*256 + (tevin_b.rgb-tevin_a.rgb)*(tevin_c.rgb+(tevin_c.rgb>>7)))%s)%s)>>8))%s", tevBiasTable[cc.bias], tevScaleTableLeft[cc.shift], tevOpTable[cc.op], tevScaleTableLeft[cc.shift], tevLerpBias[2*cc.op+(cc.shift==3)], tevScaleTableRight[cc.shift]);
-	}
-	else
-	{
-		const char *function_table[] =
-		{
-			"((tevin_a.r > tevin_b.r) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_R8_GT
-			"((tevin_a.r == tevin_b.r) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_R8_EQ
-			"((idot(tevin_a.rgb, comp16) >  idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_GR16_GT
-			"((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_GR16_EQ
-			"((idot(tevin_a.rgb, comp24) >  idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_BGR24_GT
-			"((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.rgb : int3(0,0,0))", // TEVCMP_BGR24_EQ
-			"(max(sign(tevin_a.rgb - tevin_b.rgb), int3(0,0,0)) * tevin_c.rgb)", // TEVCMP_RGB8_GT
-			"((int3(255,255,255) - max(sign(abs(tevin_a.rgb - tevin_b.rgb))), int3(0,0,0))) * tevin_c.rgb)" // TEVCMP_RGB8_EQ
-		};
-
-		int mode = (cc.shift<<1)|cc.op;
-		out.Write("   tevin_d.rgb + ");
-		out.Write(function_table[mode]);
-	}
-	if (cc.clamp)
-		out.Write(", int3(0,0,0), int3(255,255,255))");
-	else
-		out.Write(", int3(-1024,-1024,-1024), int3(1023,1023,1023))");
-	out.Write(";\n");
-
-	out.Write("\t// alpha combine\n");
-	out.Write("\t%s = clamp(", tevAOutputTable[ac.dest]);
-
-	if (ac.bias != TevBias_COMPARE) // if not compare
-	{
-		out.Write("(((tevin_d.a%s)%s) %s ((((tevin_a.a*256 + (tevin_b.a-tevin_a.a)*(tevin_c.a+(tevin_c.a>>7)))%s)%s)>>8))%s", tevBiasTable[ac.bias], tevScaleTableLeft[ac.shift], tevOpTable[ac.op], tevScaleTableLeft[ac.shift], tevLerpBias[2*ac.op+(ac.shift==3)], tevScaleTableRight[ac.shift]);
-	}
-	else
-	{
-		const char *function_table[] =
-		{
-			"((tevin_a.r > tevin_b.r) ? tevin_c.a : 0)", // TEVCMP_R8_GT
-			"((tevin_a.r == tevin_b.r) ? tevin_c.a : 0)", // TEVCMP_R8_EQ
-			"((idot(tevin_a.rgb, comp16) >  idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)", // TEVCMP_GR16_GT
-			"((idot(tevin_a.rgb, comp16) == idot(tevin_b.rgb, comp16)) ? tevin_c.a : 0)", // TEVCMP_GR16_EQ
-			"((idot(tevin_a.rgb, comp24) >  idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)", // TEVCMP_BGR24_GT
-			"((idot(tevin_a.rgb, comp24) == idot(tevin_b.rgb, comp24)) ? tevin_c.a : 0)", // TEVCMP_BGR24_EQ
-			"((tevin_a.a >  tevin_b.a) ? tevin_c.a : 0)", // TEVCMP_A8_GT
-			"((tevin_a.a == tevin_b.a) ? tevin_c.a : 0)" // TEVCMP_A8_EQ
-		};
-
-		int mode = (ac.shift<<1)|ac.op;
-		out.Write("   tevin_d.a + ");
-		out.Write(function_table[mode]);
-	}
-	if (ac.clamp)
-		out.Write(", 0, 255)");
-	else
-		out.Write(", -1024, 1023)");
-
-	out.Write(";\n\n");
+	// Regular TEV stage: (d + bias + lerp(a,b,c)) * scale
+	// The GC/Wii GPU uses a very sophisticated algorithm for scale-lerping:
+	// - c is scaled from 0..255 to 0..256, which allows dividing the result by 256 instead of 255
+	// - if scale is bigger than one, it is moved inside the lerp calculation for increased accuracy
+	// - a rounding bias is added before dividing by 256
+	out.Write("(((tevin_d.%s%s)%s)", components, tevBiasTable[bias], tevScaleTableLeft[shift]);
+	out.Write(" %s ", tevOpTable[op]);
+	out.Write("((((tevin_a.%s*256 + (tevin_b.%s-tevin_a.%s)*(tevin_c.%s+(tevin_c.%s>>7)))%s)%s)>>8)",
+	          components, components, components, components, components,
+	          tevScaleTableLeft[shift], tevLerpBias[2*op+(shift==3)]);
+	out.Write(")%s", tevScaleTableRight[shift]);
 }
 
 template<class T>

From a8c8db8da7487b4bc3bcfb549c296a811c691042 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Mon, 24 Mar 2014 14:42:04 +0100
Subject: [PATCH 4/5] Software renderer: Use color combiner configuration for
 alpha combiner compare mode inputs.

As pointed out by dolphin-emu/hwtests@f684f2498.
---
 Source/Core/VideoBackends/Software/Tev.cpp | 270 ++++++++-------------
 Source/Core/VideoBackends/Software/Tev.h   |   8 +-
 2 files changed, 100 insertions(+), 178 deletions(-)

diff --git a/Source/Core/VideoBackends/Software/Tev.cpp b/Source/Core/VideoBackends/Software/Tev.cpp
index 1890aaa3e3..3c294ef5f6 100644
--- a/Source/Core/VideoBackends/Software/Tev.cpp
+++ b/Source/Core/VideoBackends/Software/Tev.cpp
@@ -55,14 +55,14 @@ void Tev::Init()
 	m_ColorInputLUT[14][RED_INP] = &StageKonst[RED_C]; m_ColorInputLUT[14][GRN_INP] = &StageKonst[GRN_C]; m_ColorInputLUT[14][BLU_INP] = &StageKonst[BLU_C]; // konst
 	m_ColorInputLUT[15][RED_INP] = &FixedConstants[0]; m_ColorInputLUT[15][GRN_INP] = &FixedConstants[0]; m_ColorInputLUT[15][BLU_INP] = &FixedConstants[0]; // zero
 
-	m_AlphaInputLUT[0] = Reg[0]; // prev
-	m_AlphaInputLUT[1] = Reg[1]; // c0
-	m_AlphaInputLUT[2] = Reg[2]; // c1
-	m_AlphaInputLUT[3] = Reg[3]; // c2
-	m_AlphaInputLUT[4] = TexColor; // tex
-	m_AlphaInputLUT[5] = RasColor; // ras
-	m_AlphaInputLUT[6] = StageKonst; // konst
-	m_AlphaInputLUT[7] = Zero16; // zero
+	m_AlphaInputLUT[0] = &Reg[0][ALP_C]; // prev
+	m_AlphaInputLUT[1] = &Reg[1][ALP_C]; // c0
+	m_AlphaInputLUT[2] = &Reg[2][ALP_C]; // c1
+	m_AlphaInputLUT[3] = &Reg[3][ALP_C]; // c2
+	m_AlphaInputLUT[4] = &TexColor[ALP_C]; // tex
+	m_AlphaInputLUT[5] = &RasColor[ALP_C]; // ras
+	m_AlphaInputLUT[6] = &StageKonst[ALP_C]; // konst
+	m_AlphaInputLUT[7] = &Zero16[ALP_C]; // zero
 
 	for (int comp = 0; comp < 4; comp++)
 	{
@@ -176,16 +176,11 @@ void Tev::SetRasColor(int colorChan, int swaptable)
 	}
 }
 
-void Tev::DrawColorRegular(TevStageCombiner::ColorCombiner &cc)
+void Tev::DrawColorRegular(TevStageCombiner::ColorCombiner &cc, const InputRegType inputs[4])
 {
-	InputRegType InputReg;
-
 	for (int i = 0; i < 3; i++)
 	{
-		InputReg.a = *m_ColorInputLUT[cc.a][i];
-		InputReg.b = *m_ColorInputLUT[cc.b][i];
-		InputReg.c = *m_ColorInputLUT[cc.c][i];
-		InputReg.d = *m_ColorInputLUT[cc.d][i];
+		const InputRegType& InputReg = inputs[BLU_C + i];
 
 		u16 c = InputReg.c + (InputReg.c >> 7);
 
@@ -200,120 +195,66 @@ void Tev::DrawColorRegular(TevStageCombiner::ColorCombiner &cc)
 	}
 }
 
-void Tev::DrawColorCompare(TevStageCombiner::ColorCombiner &cc)
+void Tev::DrawColorCompare(TevStageCombiner::ColorCombiner &cc, const InputRegType inputs[4])
 {
-	int cmp = (cc.shift<<1)|cc.op|8; // comparemode stored here
+	for (int i = BLU_C; i <= RED_C; i++)  // inclusive bound so red is written too; assumes BLU_C..RED_C are the three consecutive color indices filled by Draw()
+	{
+		switch ((cc.shift<<1)|cc.op|8)  // encoded compare mode
+		{
+		case TEVCMP_R8_GT:
+			Reg[cc.dest][i] = inputs[i].d + ((inputs[RED_C].a > inputs[RED_C].b) ? inputs[i].c : 0);
+			break;
 
-	u32 a;
-	u32 b;
+		case TEVCMP_R8_EQ:
+			Reg[cc.dest][i] = inputs[i].d + ((inputs[RED_C].a == inputs[RED_C].b) ? inputs[i].c : 0);
+			break;
 
-	InputRegType InputReg;
+		case TEVCMP_GR16_GT:
+			{
+				u32 a = (inputs[GRN_C].a << 8) | inputs[RED_C].a;
+				u32 b = (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+				Reg[cc.dest][i] = inputs[i].d + ((a > b) ? inputs[i].c : 0);
+			}
+			break;
 
-	switch (cmp) {
-	case TEVCMP_R8_GT:
-		{
-			a = *m_ColorInputLUT[cc.a][RED_INP] & 0xff;
-			b = *m_ColorInputLUT[cc.b][RED_INP] & 0xff;
-			for (int i = 0; i < 3; i++)
+		case TEVCMP_GR16_EQ:
 			{
-				InputReg.c = *m_ColorInputLUT[cc.c][i];
-				InputReg.d = *m_ColorInputLUT[cc.d][i];
-				Reg[cc.dest][BLU_C + i] = InputReg.d + ((a > b) ? InputReg.c : 0);
+				u32 a = (inputs[GRN_C].a << 8) | inputs[RED_C].a;
+				u32 b = (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+				Reg[cc.dest][i] = inputs[i].d + ((a == b) ? inputs[i].c : 0);
 			}
-		}
-		break;
+			break;
 
-	case TEVCMP_R8_EQ:
-		{
-			a = *m_ColorInputLUT[cc.a][RED_INP] & 0xff;
-			b = *m_ColorInputLUT[cc.b][RED_INP] & 0xff;
-			for (int i = 0; i < 3; i++)
+		case TEVCMP_BGR24_GT:
 			{
-				InputReg.c = *m_ColorInputLUT[cc.c][i];
-				InputReg.d = *m_ColorInputLUT[cc.d][i];
-				Reg[cc.dest][BLU_C + i] = InputReg.d + ((a == b) ? InputReg.c : 0);
+				u32 a = (inputs[BLU_C].a << 16) | (inputs[GRN_C].a << 8) | inputs[RED_C].a;
+				u32 b = (inputs[BLU_C].b << 16) | (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+				Reg[cc.dest][i] = inputs[i].d + ((a > b) ? inputs[i].c : 0);
 			}
-		}
-		break;
-	case TEVCMP_GR16_GT:
-		{
-			a = ((*m_ColorInputLUT[cc.a][GRN_INP] & 0xff) << 8) | (*m_ColorInputLUT[cc.a][RED_INP] & 0xff);
-			b = ((*m_ColorInputLUT[cc.b][GRN_INP] & 0xff) << 8) | (*m_ColorInputLUT[cc.b][RED_INP] & 0xff);
-			for (int i = 0; i < 3; i++)
+			break;
+
+		case TEVCMP_BGR24_EQ:
 			{
-				InputReg.c = *m_ColorInputLUT[cc.c][i];
-				InputReg.d = *m_ColorInputLUT[cc.d][i];
-				Reg[cc.dest][BLU_C + i] = InputReg.d + ((a > b) ? InputReg.c : 0);
+				u32 a = (inputs[BLU_C].a << 16) | (inputs[GRN_C].a << 8) | inputs[RED_C].a;
+				u32 b = (inputs[BLU_C].b << 16) | (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+				Reg[cc.dest][i] = inputs[i].d + ((a == b) ? inputs[i].c : 0);
 			}
+			break;
+
+		case TEVCMP_RGB8_GT:
+			Reg[cc.dest][i] = inputs[i].d + ((inputs[i].a > inputs[i].b) ? inputs[i].c : 0);
+			break;
+
+		case TEVCMP_RGB8_EQ:
+			Reg[cc.dest][i] = inputs[i].d + ((inputs[i].a == inputs[i].b) ? inputs[i].c : 0);
+			break;
 		}
-		break;
-	case TEVCMP_GR16_EQ:
-		{
-			a = ((*m_ColorInputLUT[cc.a][GRN_C] & 0xff) << 8) | (*m_ColorInputLUT[cc.a][RED_INP] & 0xff);
-			b = ((*m_ColorInputLUT[cc.b][GRN_C] & 0xff) << 8) | (*m_ColorInputLUT[cc.b][RED_INP] & 0xff);
-			for (int i = 0; i < 3; i++)
-			{
-				InputReg.c = *m_ColorInputLUT[cc.c][i];
-				InputReg.d = *m_ColorInputLUT[cc.d][i];
-				Reg[cc.dest][BLU_C + i] = InputReg.d + ((a == b) ? InputReg.c : 0);
-			}
-		}
-		break;
-	case TEVCMP_BGR24_GT:
-		{
-			a = ((*m_ColorInputLUT[cc.a][BLU_C] & 0xff) << 16) | ((*m_ColorInputLUT[cc.a][GRN_C] & 0xff) << 8) | (*m_ColorInputLUT[cc.a][RED_INP] & 0xff);
-			b = ((*m_ColorInputLUT[cc.b][BLU_C] & 0xff) << 16) | ((*m_ColorInputLUT[cc.b][GRN_C] & 0xff) << 8) | (*m_ColorInputLUT[cc.b][RED_INP] & 0xff);
-			for (int i = 0; i < 3; i++)
-			{
-				InputReg.c = *m_ColorInputLUT[cc.c][i];
-				InputReg.d = *m_ColorInputLUT[cc.d][i];
-				Reg[cc.dest][BLU_C + i] = InputReg.d + ((a > b) ? InputReg.c : 0);
-			}
-		}
-		break;
-	case TEVCMP_BGR24_EQ:
-		{
-			a = ((*m_ColorInputLUT[cc.a][BLU_C] & 0xff) << 16) | ((*m_ColorInputLUT[cc.a][GRN_C] & 0xff) << 8) | (*m_ColorInputLUT[cc.a][RED_INP] & 0xff);
-			b = ((*m_ColorInputLUT[cc.b][BLU_C] & 0xff) << 16) | ((*m_ColorInputLUT[cc.b][GRN_C] & 0xff) << 8) | (*m_ColorInputLUT[cc.b][RED_INP] & 0xff);
-			for (int i = 0; i < 3; i++)
-			{
-				InputReg.c = *m_ColorInputLUT[cc.c][i];
-				InputReg.d = *m_ColorInputLUT[cc.d][i];
-				Reg[cc.dest][BLU_C + i] = InputReg.d + ((a == b) ? InputReg.c : 0);
-			}
-		}
-		break;
-	case TEVCMP_RGB8_GT:
-		for (int i = 0; i < 3; i++)
-		{
-			InputReg.a = *m_ColorInputLUT[cc.a][i];
-			InputReg.b = *m_ColorInputLUT[cc.b][i];
-			InputReg.c = *m_ColorInputLUT[cc.c][i];
-			InputReg.d = *m_ColorInputLUT[cc.d][i];
-			Reg[cc.dest][BLU_C + i] = InputReg.d + ((InputReg.a > InputReg.b) ? InputReg.c : 0);
-		}
-		break;
-	case TEVCMP_RGB8_EQ:
-		for (int i = 0; i < 3; i++)
-		{
-			InputReg.a = *m_ColorInputLUT[cc.a][i];
-			InputReg.b = *m_ColorInputLUT[cc.b][i];
-			InputReg.c = *m_ColorInputLUT[cc.c][i];
-			InputReg.d = *m_ColorInputLUT[cc.d][i];
-			Reg[cc.dest][BLU_C + i] = InputReg.d + ((InputReg.a == InputReg.b) ? InputReg.c : 0);
-		}
-		break;
 	}
 }
 
-void Tev::DrawAlphaRegular(TevStageCombiner::AlphaCombiner &ac)
+void Tev::DrawAlphaRegular(TevStageCombiner::AlphaCombiner &ac, const InputRegType inputs[4])
 {
-	InputRegType InputReg;
-
-	InputReg.a = m_AlphaInputLUT[ac.a][ALP_C];
-	InputReg.b = m_AlphaInputLUT[ac.b][ALP_C];
-	InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-	InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
+	const InputRegType& InputReg = inputs[ALP_C];
 
 	u16 c = InputReg.c + (InputReg.c >> 7);
 
@@ -327,88 +268,56 @@ void Tev::DrawAlphaRegular(TevStageCombiner::AlphaCombiner &ac)
 	Reg[ac.dest][ALP_C] = result;
 }
 
-void Tev::DrawAlphaCompare(TevStageCombiner::AlphaCombiner &ac)
+void Tev::DrawAlphaCompare(TevStageCombiner::AlphaCombiner& ac, const InputRegType inputs[4])
 {
-	int cmp = (ac.shift<<1)|ac.op|8; // comparemode stored here
-
-	u32 a;
-	u32 b;
-
-	InputRegType InputReg;
-
-	switch (cmp) {
+	switch ((ac.shift<<1)|ac.op|8)  // encoded compare mode
+	{
 	case TEVCMP_R8_GT:
-		{
-			a = m_AlphaInputLUT[ac.a][RED_C] & 0xff;
-			b = m_AlphaInputLUT[ac.b][RED_C] & 0xff;
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((a > b) ? InputReg.c : 0);
-		}
+		Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((inputs[RED_C].a > inputs[RED_C].b) ? inputs[ALP_C].c : 0);
 		break;
 
 	case TEVCMP_R8_EQ:
-		{
-			a = m_AlphaInputLUT[ac.a][RED_C] & 0xff;
-			b = m_AlphaInputLUT[ac.b][RED_C] & 0xff;
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((a == b) ? InputReg.c : 0);
-		}
+		Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((inputs[RED_C].a == inputs[RED_C].b) ? inputs[ALP_C].c : 0);
 		break;
+
 	case TEVCMP_GR16_GT:
 		{
-			a = ((m_AlphaInputLUT[ac.a][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.a][RED_C] & 0xff);
-			b = ((m_AlphaInputLUT[ac.b][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.b][RED_C] & 0xff);
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((a > b) ? InputReg.c : 0);
+			u32 a = (inputs[GRN_C].a << 8) | inputs[RED_C].a;
+			u32 b = (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+			Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((a > b) ? inputs[ALP_C].c : 0);
 		}
 		break;
+
 	case TEVCMP_GR16_EQ:
 		{
-			a = ((m_AlphaInputLUT[ac.a][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.a][RED_C] & 0xff);
-			b = ((m_AlphaInputLUT[ac.b][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.b][RED_C] & 0xff);
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((a == b) ? InputReg.c : 0);
+			u32 a = (inputs[GRN_C].a << 8) | inputs[RED_C].a;
+			u32 b = (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+			Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((a == b) ? inputs[ALP_C].c : 0);
 		}
 		break;
+
 	case TEVCMP_BGR24_GT:
 		{
-			a = ((m_AlphaInputLUT[ac.a][BLU_C] & 0xff) << 16) | ((m_AlphaInputLUT[ac.a][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.a][RED_C] & 0xff);
-			b = ((m_AlphaInputLUT[ac.b][BLU_C] & 0xff) << 16) | ((m_AlphaInputLUT[ac.b][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.b][RED_C] & 0xff);
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((a > b) ? InputReg.c : 0);
+			u32 a = (inputs[BLU_C].a << 16) | (inputs[GRN_C].a << 8) | inputs[RED_C].a;
+			u32 b = (inputs[BLU_C].b << 16) | (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+			Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((a > b) ? inputs[ALP_C].c : 0);
 		}
 		break;
+
 	case TEVCMP_BGR24_EQ:
 		{
-			a = ((m_AlphaInputLUT[ac.a][BLU_C] & 0xff) << 16) | ((m_AlphaInputLUT[ac.a][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.a][RED_C] & 0xff);
-			b = ((m_AlphaInputLUT[ac.b][BLU_C] & 0xff) << 16) | ((m_AlphaInputLUT[ac.b][GRN_C] & 0xff) << 8) | (m_AlphaInputLUT[ac.b][RED_C] & 0xff);
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((a == b) ? InputReg.c : 0);
+			u32 a = (inputs[BLU_C].a << 16) | (inputs[GRN_C].a << 8) | inputs[RED_C].a;
+			u32 b = (inputs[BLU_C].b << 16) | (inputs[GRN_C].b << 8) | inputs[RED_C].b;
+			Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((a == b) ? inputs[ALP_C].c : 0);
 		}
 		break;
+
 	case TEVCMP_A8_GT:
-		{
-			InputReg.a = m_AlphaInputLUT[ac.a][ALP_C];
-			InputReg.b = m_AlphaInputLUT[ac.b][ALP_C];
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((InputReg.a > InputReg.b) ? InputReg.c : 0);
-		}
+		Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((inputs[ALP_C].a > inputs[ALP_C].b) ? inputs[ALP_C].c : 0);
 		break;
+
 	case TEVCMP_A8_EQ:
-		{
-			InputReg.a = m_AlphaInputLUT[ac.a][ALP_C];
-			InputReg.b = m_AlphaInputLUT[ac.b][ALP_C];
-			InputReg.c = m_AlphaInputLUT[ac.c][ALP_C];
-			InputReg.d = m_AlphaInputLUT[ac.d][ALP_C];
-			Reg[ac.dest][ALP_C] = InputReg.d + ((InputReg.a == InputReg.b) ? InputReg.c : 0);
-		}
+		Reg[ac.dest][ALP_C] = inputs[ALP_C].d + ((inputs[ALP_C].a == inputs[ALP_C].b) ? inputs[ALP_C].c : 0);
 		break;
 	}
 }
@@ -666,10 +575,23 @@ void Tev::Draw()
 		SetRasColor(order.getColorChan(stageOdd), ac.rswap * 2);
 
 		// combine inputs
+		InputRegType inputs[4];
+		for (int i = 0; i < 3; i++)
+		{
+			inputs[BLU_C + i].a = *m_ColorInputLUT[cc.a][i];
+			inputs[BLU_C + i].b = *m_ColorInputLUT[cc.b][i];
+			inputs[BLU_C + i].c = *m_ColorInputLUT[cc.c][i];
+			inputs[BLU_C + i].d = *m_ColorInputLUT[cc.d][i];
+		}
+		inputs[ALP_C].a = *m_AlphaInputLUT[ac.a];
+		inputs[ALP_C].b = *m_AlphaInputLUT[ac.b];
+		inputs[ALP_C].c = *m_AlphaInputLUT[ac.c];
+		inputs[ALP_C].d = *m_AlphaInputLUT[ac.d];
+
 		if (cc.bias != 3)
-			DrawColorRegular(cc);
+			DrawColorRegular(cc, inputs);
 		else
-			DrawColorCompare(cc);
+			DrawColorCompare(cc, inputs);
 
 		if (cc.clamp)
 		{
@@ -685,9 +607,9 @@ void Tev::Draw()
 		}
 
 		if (ac.bias != 3)
-			DrawAlphaRegular(ac);
+			DrawAlphaRegular(ac, inputs);
 		else
-			DrawAlphaCompare(ac);
+			DrawAlphaCompare(ac, inputs);
 
 		if (ac.clamp)
 			Reg[ac.dest][ALP_C] = Clamp255(Reg[ac.dest][ALP_C]);
diff --git a/Source/Core/VideoBackends/Software/Tev.h b/Source/Core/VideoBackends/Software/Tev.h
index 617dee842b..ecb5fde9f0 100644
--- a/Source/Core/VideoBackends/Software/Tev.h
+++ b/Source/Core/VideoBackends/Software/Tev.h
@@ -60,10 +60,10 @@ class Tev
 
 	void SetRasColor(int colorChan, int swaptable);
 
-	void DrawColorRegular(TevStageCombiner::ColorCombiner &cc);
-	void DrawColorCompare(TevStageCombiner::ColorCombiner &cc);
-	void DrawAlphaRegular(TevStageCombiner::AlphaCombiner &ac);
-	void DrawAlphaCompare(TevStageCombiner::AlphaCombiner &ac);
+	void DrawColorRegular(TevStageCombiner::ColorCombiner& cc, const InputRegType inputs[4]);
+	void DrawColorCompare(TevStageCombiner::ColorCombiner& cc, const InputRegType inputs[4]);
+	void DrawAlphaRegular(TevStageCombiner::AlphaCombiner& ac, const InputRegType inputs[4]);
+	void DrawAlphaCompare(TevStageCombiner::AlphaCombiner& ac, const InputRegType inputs[4]);
 
 	void Indirect(unsigned int stageNum, s32 s, s32 t);
 

From 1dead05cae47aeb9cb9e14a65a286e0f0de8dc95 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Mon, 24 Mar 2014 14:54:17 +0100
Subject: [PATCH 5/5] Software renderer: Properly calculate tev combiner
 output.

As pointed out by dolphin-emu/hwtests@461476112.
---
 Source/Core/VideoBackends/Software/Tev.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/Source/Core/VideoBackends/Software/Tev.cpp b/Source/Core/VideoBackends/Software/Tev.cpp
index 3c294ef5f6..8ee3715e03 100644
--- a/Source/Core/VideoBackends/Software/Tev.cpp
+++ b/Source/Core/VideoBackends/Software/Tev.cpp
@@ -185,10 +185,11 @@ void Tev::DrawColorRegular(TevStageCombiner::ColorCombiner &cc, const InputRegTy
 		u16 c = InputReg.c + (InputReg.c >> 7);
 
 		s32 temp = InputReg.a * (256 - c) + (InputReg.b * c);
-		temp = cc.op?(-temp >> 8):(temp >> 8);
+		temp <<= m_ScaleLShiftLUT[cc.shift];
+		temp += (cc.shift != 3) ? 0 : (cc.op == 1) ? 127 : 128;
+		temp = cc.op ? (-temp >> 8) : (temp >> 8);
 
-		s32 result = InputReg.d + temp + m_BiasLUT[cc.bias];
-		result = result << m_ScaleLShiftLUT[cc.shift];
+		s32 result = ((InputReg.d + m_BiasLUT[cc.bias]) << m_ScaleLShiftLUT[cc.shift]) + temp;
 		result = result >> m_ScaleRShiftLUT[cc.shift];
 
 		Reg[cc.dest][BLU_C + i] = result;
@@ -259,10 +260,11 @@ void Tev::DrawAlphaRegular(TevStageCombiner::AlphaCombiner &ac, const InputRegTy
 	u16 c = InputReg.c + (InputReg.c >> 7);
 
 	s32 temp = InputReg.a * (256 - c) + (InputReg.b * c);
-	temp = ac.op?(-temp >> 8):(temp >> 8);
+	temp <<= m_ScaleLShiftLUT[ac.shift];
+	temp += (ac.shift != 3) ? 0 : (ac.op == 1) ? 127 : 128;
+	temp = ac.op ? (-temp >> 8) : (temp >> 8);
 
-	s32 result = InputReg.d + temp + m_BiasLUT[ac.bias];
-	result = result << m_ScaleLShiftLUT[ac.shift];
+	s32 result = ((InputReg.d + m_BiasLUT[ac.bias]) << m_ScaleLShiftLUT[ac.shift]) + temp;
 	result = result >> m_ScaleRShiftLUT[ac.shift];
 
 	Reg[ac.dest][ALP_C] = result;