Refactor UV offset and limit code to be shared by all renderers.

- Calculate and apply offset for quads with flipped UVs earlier.
- Reduce offsets applied in software renderer as resolution scaling increases (fixes line artefacts).
- Calculate UV limits used for filtering earlier and pass these to the individual renderers.
2024-11-23 16:59:49 +00:00 · 2018-11-30 12:11:39 +00:00 · 2018-11-30 12:11:39 +00:00 · 1e0e774ad2
commit 1e0e774ad2
parent a081f34a52
11 changed files with 310 additions and 296 deletions
--- a/mednafen/psx/gpu.h
+++ b/mednafen/psx/gpu.h
@ -148,6 +148,11 @@ struct PS_GPU
   uint32 InQuad_clut;
   bool InQuad_invalidW;

+   // primitive UV offsets (used to correct flipped sprites)
+   uint16_t off_u, off_v;
+   // primitive UV limits (used to clamp texture sampling)
+   uint16_t min_u, min_v, max_u, max_v;
+
   line_point InPLine_PrevPoint;

   uint32 FBRW_X;
--- a/mednafen/psx/gpu_polygon.cpp
+++ b/mednafen/psx/gpu_polygon.cpp
@ -1,4 +1,5 @@
 #include <math.h>
+#include <algorithm>
 #include "libretro_cbs.h"

 #define COORD_FBS 12
@ -173,7 +174,7 @@ static INLINE void DrawSpan(PS_GPU *gpu, int y, const int32 x_start, const int32

   if(textured)
   {
-      uint16 fbw = GetTexel<TexMode_TA>(gpu, ig.u >> (COORD_FBS + COORD_POST_PADDING), ig.v >> (COORD_FBS + COORD_POST_PADDING));
+	   uint16 fbw = GetTexel<TexMode_TA>(gpu, ig.u >> (COORD_FBS + COORD_POST_PADDING), ig.v >> (COORD_FBS + COORD_POST_PADDING));

    if(fbw)
    {
@ -290,8 +291,8 @@ static INLINE void DrawTriangle(PS_GPU *gpu, tri_vertex *vertices)

 if(textured)
 {
-  ig.u = (COORD_MF_INT(vertices[core_vertex].u) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
-  ig.v = (COORD_MF_INT(vertices[core_vertex].v) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
+  ig.u = (COORD_MF_INT(vertices[core_vertex].u) + (1 << (COORD_FBS - 1 - gpu->upscale_shift))) << COORD_POST_PADDING;
+  ig.v = (COORD_MF_INT(vertices[core_vertex].v) + (1 << (COORD_FBS - 1 - gpu->upscale_shift))) << COORD_POST_PADDING;

  if (gpu->upscale_shift > 0)
     {
@ -300,10 +301,10 @@ static INLINE void DrawTriangle(PS_GPU *gpu, tri_vertex *vertices)
        // triangles. Otherwise this could cause a small "shift" in
        // the texture coordinates when upscaling.

-        if (idl.du_dy == 0 && (int32_t)idl.du_dx > 0)
-           ig.u -= (1 << (COORD_FBS - 1 - gpu->upscale_shift));
-        if (idl.dv_dx == 0 && (int32_t)idl.dv_dy > 0)
-           ig.v -= (1 << (COORD_FBS - 1 - gpu->upscale_shift));
+		 if(gpu->off_u)
+			ig.u += (COORD_MF_INT(1) - (1 << (COORD_FBS - gpu->upscale_shift))) << COORD_POST_PADDING;
+		 if (gpu->off_v)
+			 ig.v += (COORD_MF_INT(1) - (1 << (COORD_FBS - gpu->upscale_shift))) << COORD_POST_PADDING;
     }
 }

@ -491,6 +492,189 @@ if(vertices[1].y == vertices[0].y)
 #endif
 }

+
+/* Determine whether to offset UVs to account for difference in interpolation between PS1 and modern GPUs.
+ * The result is stored in gpu->off_u / gpu->off_v, each of which ends up as 0 or 1.
+ * An offset of 1 is only produced when count == 4, the triangle formed by vertices[0..2] has
+ * non-zero screen area and non-zero texture area, and the U (or V) gradient is negative along
+ * one screen axis while being exactly zero along the other. In every other case the offset is 0.
+ * @param gpu       GPU state that receives off_u / off_v.
+ * @param vertices  Only precise[0], precise[1], u and v of the first three entries are read.
+ * @param count     Vertex count of the primitive; the workaround is applied to quads (4) only. */
+void Calc_UVOffsets(PS_GPU *gpu, tri_vertex *vertices, unsigned count)
+{
+	// iCB: Just borrowing this from \parallel-psx\renderer\renderer.cpp
+	uint16 off_u = 0;
+	uint16 off_v = 0;
+
+	// For X/Y flipped 2D sprites, PSX games rely on a very specific rasterization behavior.
+	// If U or V is decreasing in X or Y, and we use the provided U/V as is, we will sample the wrong texel as interpolation
+	// covers an entire pixel, while PSX samples its interpolation essentially in the top-left corner and splats that interpolant across the entire pixel.
+	// While we could emulate this reasonably well in native resolution by shifting our vertex coords by 0.5,
+	// this breaks in upscaling scenarios, because we have several samples per native sample and we need NN rules to hit the same UV every time.
+	// One approach here is to use interpolate at offset or similar tricks to generalize the PSX interpolation patterns,
+	// but the problem is that vertices sharing an edge will no longer see the same UV (due to different plane derivatives),
+	// we end up sampling outside the intended boundary and artifacts are inevitable, so the only case where we can apply this fixup is for "sprites"
+	// or similar which should not share edges, which leads to this unfortunate code below.
+	//
+	// Only apply this workaround for quads.
+	if (count == 4)
+	{
+		// It might be faster to do more direct checking here, but the code below handles primitives in any order
+		// and orientation, and is far more SIMD-friendly if needed.
+		float abx = vertices[1].precise[0] - vertices[0].precise[0];
+		float aby = vertices[1].precise[1] - vertices[0].precise[1];
+		float bcx = vertices[2].precise[0] - vertices[1].precise[0];
+		float bcy = vertices[2].precise[1] - vertices[1].precise[1];
+		float cax = vertices[0].precise[0] - vertices[2].precise[0];
+		float cay = vertices[0].precise[1] - vertices[2].precise[1];
+
+		// Compute static derivatives, just assume W is uniform across the primitive
+		// and that the plane equation remains the same across the quad.
+		float dudx = -aby * float(vertices[2].u) - bcy * float(vertices[0].u) - cay * float(vertices[1].u);
+		float dvdx = -aby * float(vertices[2].v) - bcy * float(vertices[0].v) - cay * float(vertices[1].v);
+		float dudy = +abx * float(vertices[2].u) + bcx * float(vertices[0].u) + cax * float(vertices[1].u);
+		float dvdy = +abx * float(vertices[2].v) + bcx * float(vertices[0].v) + cax * float(vertices[1].v);
+		float area = bcx * cay - bcy * cax; // cross product of edges bc and ca; zero when the three positions are collinear
+
+		// iCB: Detect and reject any triangles with 0 size texture area
+		float texArea = (vertices[1].u - vertices[0].u) * (vertices[2].v - vertices[0].v) - (vertices[2].u - vertices[0].u) * (vertices[1].v - vertices[0].v);
+
+		// Shouldn't matter as degenerate primitives will be culled anyways.
+		if ((area != 0.0f) && (texArea != 0.0f))
+		{
+			float inv_area = 1.0f / area;
+			dudx *= inv_area;
+			dudy *= inv_area;
+			dvdx *= inv_area;
+			dvdy *= inv_area;
+
+			bool neg_dudx = dudx < 0.0f;
+			bool neg_dudy = dudy < 0.0f;
+			bool neg_dvdx = dvdx < 0.0f;
+			bool neg_dvdy = dvdy < 0.0f;
+			bool zero_dudx = dudx == 0.0f;
+			bool zero_dudy = dudy == 0.0f;
+			bool zero_dvdx = dvdx == 0.0f;
+			bool zero_dvdy = dvdy == 0.0f;
+
+			// If we have negative dU or dV in any direction, increment the U or V to work properly with nearest-neighbor in this impl.
+			// If we don't have 1:1 pixel correspondence, this creates a slight "shift" in the sprite, but we guarantee that we don't sample garbage at least.
+			// Overall, this is kinda hacky because there can be legitimate, rare cases where 3D meshes hit this scenario, and a single texel offset can pop in, but
+			// this is way better than having borked 2D overall.
+			// TODO: Try to figure out if this can be generalized.
+			//
+			// TODO: If perf becomes an issue, we can probably SIMD the 8 comparisons above,
+			// create an 8-bit code, and use a LUT to get the offsets.
+			// Case 1: U is decreasing in X, but no change in Y.
+			// Case 2: U is decreasing in Y, but no change in X.
+			// Case 3: V is decreasing in X, but no change in Y.
+			// Case 4: V is decreasing in Y, but no change in X.
+			if (neg_dudx && zero_dudy)
+				off_u++;
+			else if (neg_dudy && zero_dudx)
+				off_u++;
+			if (neg_dvdx && zero_dvdy)
+				off_v++;
+			else if (neg_dvdy && zero_dvdx)
+				off_v++;
+		}
+	}
+
+	gpu->off_u = off_u; // always written, so a non-quad or degenerate primitive clears any previous offset
+	gpu->off_v = off_v;
+}
+
+/* Reset the min/max UV limits stored on the GPU state, ready for a new primitive.
+ * The minima are set to UINT16_MAX and the maxima to 0, so that the std::min / std::max
+ * folding done in Extend_UVLimits() replaces them with the first vertex it reads.
+ * @param gpu  GPU state whose min_u / min_v / max_u / max_v fields are overwritten. */
+void Reset_UVLimits(PS_GPU *gpu)
+{
+	gpu->min_u = UINT16_MAX;
+	gpu->min_v = UINT16_MAX;
+	gpu->max_u = 0;
+	gpu->max_v = 0;
+}
+
+/* Grow the running min/max UV limits on the GPU state so that they cover `count` vertices.
+ * When both gpu->SUCV.TWX_AND and gpu->SUCV.TWY_AND are 0xff, the u/v of every vertex is folded
+ * into min_u / min_v / max_u / max_v. Otherwise the limits are set to the full range 0..UINT16_MAX.
+ * The function accumulates onto the values already stored in gpu, so it may be called several
+ * times for one primitive after a Reset_UVLimits().
+ * @param gpu       GPU state holding the limits and the texture-window masks.
+ * @param vertices  Vertices whose u / v are read (each converted to uint16_t).
+ * @param count     Number of entries of `vertices` to read. */
+void Extend_UVLimits(PS_GPU *gpu, tri_vertex *vertices, unsigned count)
+{
+	uint8 twx = gpu->SUCV.TWX_AND;
+	uint8 twy = gpu->SUCV.TWY_AND;
+
+	uint16 min_u = gpu->min_u;
+	uint16 min_v = gpu->min_v;
+	uint16 max_u = gpu->max_u;
+	uint16 max_v = gpu->max_v;
+
+	if ((twx == (uint8)0xffu) && (twy == (uint8)0xffu))
+	{
+		// If we're not using texture window, we're likely accessing a small subset of the texture.
+		for (unsigned int i = 0; i < count; i++)
+		{
+			min_u = std::min(min_u, uint16_t(vertices[i].u));
+			min_v = std::min(min_v, uint16_t(vertices[i].v));
+			max_u = std::max(max_u, uint16_t(vertices[i].u));
+			max_v = std::max(max_v, uint16_t(vertices[i].v));
+		}
+	}
+	else
+	{
+		// A texture window is in use (a mask differs from 0xff), so don't clamp: use the full range.
+		min_u = 0;
+		min_v = 0;
+		max_u = UINT16_MAX;
+		max_v = UINT16_MAX;
+	}
+
+	gpu->min_u = min_u;
+	gpu->min_v = min_v;
+	gpu->max_u = max_u;
+	gpu->max_v = max_v;
+}
+
+/* Apply the UV offsets to the accumulated UV limits and finish them off, in place on the GPU state.
+ * With no texture window (gpu->SUCV.TWX_AND and TWY_AND both 0xff) this:
+ *   - adds gpu->off_u / gpu->off_v to the min and max limits,
+ *   - decrements each maximum by one when it is greater than the matching minimum,
+ *   - masks a maximum to its low 8 bits when its upper byte equals that of the minimum.
+ * Otherwise the limits are set to the full range 0..UINT16_MAX.
+ * @param gpu  GPU state; off_u / off_v and the SUCV masks are read, min_u / min_v / max_u / max_v are rewritten. */
+void Finalise_UVLimits(PS_GPU *gpu)
+{
+	uint8 twx = gpu->SUCV.TWX_AND;
+	uint8 twy = gpu->SUCV.TWY_AND;
+
+	uint16 min_u = gpu->min_u;
+	uint16 min_v = gpu->min_v;
+	uint16 max_u = gpu->max_u;
+	uint16 max_v = gpu->max_v;
+
+	uint16 off_u = gpu->off_u;
+	uint16 off_v = gpu->off_v;
+
+	if ((twx == (uint8)0xffu) && (twy == (uint8)0xffu))
+	{
+		// Shift the limits by the same offsets that are applied to the UVs themselves.
+		min_u += off_u;
+		min_v += off_v;
+		max_u += off_u;
+		max_v += off_v;
+
+		// In nearest neighbor, we'll get *very* close to this UV, but not close enough to actually sample it.
+		// If du/dx or dv/dx are negative, we probably need to invert this though ...
+		if (max_u > min_u)
+			max_u--;
+		if (max_v > min_v)
+			max_v--;
+
+		// If there's no wrapping, we can prewrap and avoid fallback.
+		if ((max_u & 0xff00) == (min_u & 0xff00))
+			max_u &= 0xff;
+		if ((max_v & 0xff00) == (min_v & 0xff00))
+			max_v &= 0xff;
+	}
+	else
+	{
+		// A texture window is in use (a mask differs from 0xff), so don't clamp: use the full range.
+		min_u = 0;
+		min_v = 0;
+		max_u = UINT16_MAX;
+		max_v = UINT16_MAX;
+	}
+
+	gpu->min_u = min_u;
+	gpu->min_v = min_v;
+	gpu->max_u = max_u;
+	gpu->max_v = max_v;
+}
+
+
 // 0 = disabled
 // 1 = enabled (default mode) 
 // 2 = enabled (aggressive mode)
@ -776,6 +960,10 @@ static void Command_DrawPolygon(PS_GPU *gpu, const uint32_t *cb)
      for (unsigned v = 0; v < 3; v++)
         vertices[v].precise[2] = 1.f;

+   // Calculate UV offsets (needed for the hardware renderers and for the software renderer when upscaling).
+   // This is a one-time update per primitive: it is not repeated when InCmd == INCMD_QUAD.
+   if (textured && (gpu->InCmd != INCMD_QUAD))
+      Calc_UVOffsets(gpu, vertices, numvertices);

   if(numvertices == 4)
   {
@ -847,6 +1035,8 @@ static void Command_DrawPolygon(PS_GPU *gpu, const uint32_t *cb)

 		if (rsx_intf_is_type() == RSX_OPENGL || rsx_intf_is_type() == RSX_VULKAN)
 		{
+			Reset_UVLimits(gpu);
+
 			if (numvertices == 4)
 			{
 				if (gpu->InCmd == INCMD_NONE)
@ -854,6 +1044,10 @@ static void Command_DrawPolygon(PS_GPU *gpu, const uint32_t *cb)
 					// We have 4 quad vertices, we can push that at once
 					tri_vertex *first = &gpu->InQuad_F3Vertices[0];

+					Extend_UVLimits(gpu, first, 1);
+					Extend_UVLimits(gpu, vertices, 3);
+					Finalise_UVLimits(gpu);
+
 					rsx_intf_push_quad(first->precise[0],
 						first->precise[1],
 						first->precise[2],
@ -878,10 +1072,12 @@ static void Command_DrawPolygon(PS_GPU *gpu, const uint32_t *cb)
 						((uint32_t)vertices[2].r) |
 						((uint32_t)vertices[2].g << 8) |
 						((uint32_t)vertices[2].b << 16),
-						first->u, first->v,
-						vertices[0].u, vertices[0].v,
-						vertices[1].u, vertices[1].v,
-						vertices[2].u, vertices[2].v,
+						first->u + gpu->off_u, first->v + gpu->off_v,
+						vertices[0].u + gpu->off_u, vertices[0].v + gpu->off_v,
+						vertices[1].u + gpu->off_u, vertices[1].v + gpu->off_v,
+						vertices[2].u + gpu->off_u, vertices[2].v + gpu->off_v,
+						gpu->min_u, gpu->min_v,
+						gpu->max_u, gpu->max_v,
 						gpu->TexPageX, gpu->TexPageY,
 						clut_x, clut_y,
 						blend_mode,
@ -894,6 +1090,9 @@ static void Command_DrawPolygon(PS_GPU *gpu, const uint32_t *cb)
 			}
 			else
 			{
+				Extend_UVLimits(gpu, vertices, 3);
+				Finalise_UVLimits(gpu);
+
 				// Push a single triangle
 				rsx_intf_push_triangle(vertices[0].precise[0],
 					vertices[0].precise[1],
@ -916,6 +1115,8 @@ static void Command_DrawPolygon(PS_GPU *gpu, const uint32_t *cb)
 					vertices[0].u, vertices[0].v,
 					vertices[1].u, vertices[1].v,
 					vertices[2].u, vertices[2].v,
+					gpu->min_u, gpu->min_v,
+					gpu->max_u, gpu->max_v,
 					gpu->TexPageX, gpu->TexPageY,
 					clut_x, clut_y,
 					blend_mode,
--- a/mednafen/psx/gpu_sprite.cpp
+++ b/mednafen/psx/gpu_sprite.cpp
@ -36,8 +36,8 @@ static void DrawSprite(PS_GPU *gpu, int32_t x_arg, int32_t y_arg, int32_t w, int
      //else
      // u = (u + 1) & ~1;

-      if(FlipY)
-         v_inc = -1;
+	  if(FlipY)
+		  v_inc = -1;
   }

   if(x_start < gpu->ClipX0)
@ -212,6 +212,10 @@ static void Command_DrawSprite(PS_GPU *gpu, const uint32_t *cb)
                           v + h,            /* t2y */
                           u + w,            /* t5x */
                           v + h,            /* t5y */
+						   u,
+		                   v,
+		                   u + w - 1,	// clamp UVs 1 pixel from edge (sampling should not quite reach it)
+		                   v + h - 1,
                           gpu->TexPageX,
                           gpu->TexPageY,
                           clut_x,
--- a/parallel-psx/renderer/renderer.cpp
+++ b/parallel-psx/renderer/renderer.cpp
@ -770,12 +770,7 @@ float Renderer::allocate_depth(const Rect &rect)

 void Renderer::build_attribs(BufferVertex *output, const Vertex *vertices, unsigned count)
 {
-	unsigned min_u = UINT16_MAX;
-   unsigned max_u = 0;
-   unsigned min_v = UINT16_MAX;
-   unsigned max_v = 0;

-   unsigned texture_limits[4];
 	
 	unsigned shift;
 	switch (render_state.texture_mode)
@ -791,119 +786,16 @@ void Renderer::build_attribs(BufferVertex *output, const Vertex *vertices, unsig
 		break;
 	}

-	uint16_t off_u = 0;
-	uint16_t off_v = 0;

 	if (render_state.texture_mode != TextureMode::None)
 	{
-		// For X/Y flipped 2D sprites, PSX games rely on a very specific rasterization behavior.
-		// If U or V is decreasing in X or Y, and we use the provided U/V as is, we will sample the wrong texel as interpolation
-		// covers an entire pixel, while PSX samples its interpolation essentially in the top-left corner and splats that interpolant across the entire pixel.
-		// While we could emulate this reasonably well in native resolution by shifting our vertex coords by 0.5,
-		// this breaks in upscaling scenarios, because we have several samples per native sample and we need NN rules to hit the same UV every time.
-		// One approach here is to use interpolate at offset or similar tricks to generalize the PSX interpolation patterns,
-		// but the problem is that vertices sharing an edge will no longer see the same UV (due to different plane derivatives),
-		// we end up sampling outside the intended boundary and artifacts are inevitable, so the only case where we can apply this fixup is for "sprites"
-		// or similar which should not share edges, which leads to this unfortunate code below.
-		//
-		// Only apply this workaround for quads.
-		if (count == 4)
-		{
-			// It might be faster to do more direct checking here, but the code below handles primitives in any order
-			// and orientation, and is far more SIMD-friendly if needed.
-			float abx = vertices[1].x - vertices[0].x;
-			float aby = vertices[1].y - vertices[0].y;
-			float bcx = vertices[2].x - vertices[1].x;
-			float bcy = vertices[2].y - vertices[1].y;
-			float cax = vertices[0].x - vertices[2].x;
-			float cay = vertices[0].y - vertices[2].y;
-
-			// Compute static derivatives, just assume W is uniform across the primitive
-			// and that the plane equation remains the same across the quad.
-			float dudx = -aby * float(vertices[2].u) - bcy * float(vertices[0].u) - cay * float(vertices[1].u);
-			float dvdx = -aby * float(vertices[2].v) - bcy * float(vertices[0].v) - cay * float(vertices[1].v);
-			float dudy = +abx * float(vertices[2].u) + bcx * float(vertices[0].u) + cax * float(vertices[1].u);
-			float dvdy = +abx * float(vertices[2].v) + bcx * float(vertices[0].v) + cax * float(vertices[1].v);
-			float area = bcx * cay - bcy * cax;
-			
-			// iCB: Detect and reject any triangles with 0 size texture area
-			float texArea = (vertices[1].u - vertices[0].u) * (vertices[2].v - vertices[0].v) - (vertices[2].u - vertices[0].u) * (vertices[1].v - vertices[0].v);
-			
-			// Shouldn't matter as degenerate primitives will be culled anyways.
-			if ((area != 0.0f) && (texArea != 0.0f))
-			{
-				float inv_area = 1.0f / area;
-				dudx *= inv_area;
-				dudy *= inv_area;
-				dvdx *= inv_area;
-				dvdy *= inv_area;
-
-				bool neg_dudx = dudx < 0.0f;
-				bool neg_dudy = dudy < 0.0f;
-				bool neg_dvdx = dvdx < 0.0f;
-				bool neg_dvdy = dvdy < 0.0f;
-				bool zero_dudx = dudx == 0.0f;
-				bool zero_dudy = dudy == 0.0f;
-				bool zero_dvdx = dvdx == 0.0f;
-				bool zero_dvdy = dvdy == 0.0f;
-
-				// If we have negative dU or dV in any direction, increment the U or V to work properly with nearest-neighbor in this impl.
-				// If we don't have 1:1 pixel correspondence, this creates a slight "shift" in the sprite, but we guarantee that we don't sample garbage at least.
-				// Overall, this is kinda hacky because there can be legitimate, rare cases where 3D meshes hit this scenario, and a single texel offset can pop in, but
-				// this is way better than having borked 2D overall.
-				// TODO: Try to figure out if this can be generalized.
-				//
-				// TODO: If perf becomes an issue, we can probably SIMD the 8 comparisons above,
-				// create an 8-bit code, and use a LUT to get the offsets.
-				// Case 1: U is decreasing in X, but no change in Y.
-				// Case 2: U is decreasing in Y, but no change in X.
-				// Case 3: V is decreasing in X, but no change in Y.
-				// Case 4: V is decreasing in Y, but no change in X.
-				if (neg_dudx && zero_dudy)
-					off_u++;
-				else if (neg_dudy && zero_dudx)
-					off_u++;
-				if (neg_dvdx && zero_dvdy)
-					off_v++;
-				else if (neg_dvdy && zero_dvdx)
-					off_v++;
-			}
-		}
 		
 		if (render_state.texture_window.mask_x == 0xffu && render_state.texture_window.mask_y == 0xffu)
 		{
-			// If we're not using texture window, we're likely accessing a small subset of the texture.
-			for (unsigned i = 0; i < count; i++)
-			{
-				min_u = min<unsigned>(min_u, vertices[i].u);
-				max_u = max<unsigned>(max_u, vertices[i].u);
-				min_v = min<unsigned>(min_v, vertices[i].v);
-				max_v = max<unsigned>(max_v, vertices[i].v);
-			}
-
-			min_u += off_u;
-			max_u += off_u;
-			min_v += off_v;
-			max_v += off_v;
-
-			// In nearest neighbor, we'll get *very* close to this UV, but not close enough to actually sample it.
-			// If du/dx or dv/dx are negative, we probably need to invert this though ...
-			if (max_u > min_u)
-				max_u--;
-			if (max_v > min_v)
-				max_v--;
-
-			// If there's no wrapping, we can prewrap and avoid fallback.
-			if ((max_u & 0xff00) == (min_u & 0xff00))
-				max_u &= 0xff;
-			if ((max_v & 0xff00) == (min_v & 0xff00))
-				max_v &= 0xff;
-				
-			texture_limits[0] = min_u;
-			texture_limits[1] = min_v;
-			texture_limits[2] = max_u;
-			texture_limits[3] = max_v;
-
+			unsigned min_u = render_state.UVLimits.min_u;
+			unsigned min_v = render_state.UVLimits.min_v;
+			unsigned max_u = render_state.UVLimits.max_u;
+			unsigned max_v = render_state.UVLimits.max_v;
 			unsigned width = max_u - min_u + 1;
 			unsigned height = max_v - min_v + 1;

@ -968,8 +860,8 @@ void Renderer::build_attribs(BufferVertex *output, const Vertex *vertices, unsig
 			int16_t(render_state.palette_offset_x),
 			int16_t(render_state.palette_offset_y),
 			int16_t(shift | (render_state.dither << 8)),
-			int16_t(vertices[i].u + off_u),
-			int16_t(vertices[i].v + off_v),
+			int16_t(vertices[i].u),
+			int16_t(vertices[i].v),
 			int16_t(render_state.texture_offset_x),
 			int16_t(render_state.texture_offset_y),
 		};
@ -979,10 +871,10 @@ void Renderer::build_attribs(BufferVertex *output, const Vertex *vertices, unsig

 		output[i].color |= render_state.force_mask_bit ? 0xff000000u : 0u;
 		
-		output[i].min_u = float(texture_limits[0]);
-		output[i].min_v = float(texture_limits[1]);
-		output[i].max_u = float(texture_limits[2]);
-		output[i].max_v = float(texture_limits[3]);
+		output[i].min_u = float(render_state.UVLimits.min_u);
+		output[i].min_v = float(render_state.UVLimits.min_v);
+		output[i].max_u = float(render_state.UVLimits.max_u);
+		output[i].max_v = float(render_state.UVLimits.max_v);
 	}
 }

--- a/parallel-psx/renderer/renderer.hpp
+++ b/parallel-psx/renderer/renderer.hpp
@ -25,6 +25,11 @@ struct TextureWindow
 	uint8_t mask_x, mask_y, or_x, or_y;
 };

+struct UVRect // Min/max texture-coordinate rectangle; held in the render state as UVLimits.
+{
+	uint16_t min_u, min_v, max_u, max_v; // lower (min_*) and upper (max_*) bounds for U and V
+};
+
 enum class SemiTransparentMode
 {
 	None,
@ -59,6 +64,8 @@ public:
 		bool bpp24 = false;
 		bool dither = false;
 		bool adaptive_smoothing = true;
+
+		UVRect UVLimits;
 	};

 	struct SaveState
@ -162,6 +169,14 @@ public:
 		render_state.texture_color_modulate = enable;
 	}

+	inline void set_UV_limits(uint16_t min_u, uint16_t min_v, uint16_t max_u, uint16_t max_v) // Store the primitive's UV limits in render_state.UVLimits, where build_attribs() reads them.
+	{
+		render_state.UVLimits.min_u = min_u;
+		render_state.UVLimits.min_v = min_v;
+		render_state.UVLimits.max_u = max_u;
+		render_state.UVLimits.max_v = max_v;
+	}
+
 	// Draw commands
 	void clear_rect(const Rect &rect, FBColor color);
 	void draw_line(const Vertex *vertices);
--- a/rsx/rsx_intf.cpp
+++ b/rsx/rsx_intf.cpp
@ -357,6 +357,8 @@ void rsx_intf_push_triangle(
      uint16_t t0x, uint16_t t0y,
      uint16_t t1x, uint16_t t1y,
      uint16_t t2x, uint16_t t2y,
+	  uint16_t min_u, uint16_t min_v,
+	  uint16_t max_u, uint16_t max_v,
      uint16_t texpage_x, uint16_t texpage_y,
      uint16_t clut_x, uint16_t clut_y,
      uint8_t texture_blend_mode,
@ -387,6 +389,7 @@ void rsx_intf_push_triangle(
 #if defined(HAVE_OPENGL) || defined(HAVE_OPENGLES)
         rsx_gl_push_triangle(p0x, p0y, p0w, p1x, p1y, p1w, p2x, p2y, p2w,
               c0, c1, c2, t0x, t0y, t1x, t1y, t2x, t2y,
+			   min_u, min_v, max_u, max_v,
               texpage_x, texpage_y, clut_x, clut_y,
               texture_blend_mode,
               depth_shift,
@ -398,6 +401,7 @@ void rsx_intf_push_triangle(
 #if defined(HAVE_VULKAN)
         rsx_vulkan_push_triangle(p0x, p0y, p0w, p1x, p1y, p1w, p2x, p2y, p2w,
               c0, c1, c2, t0x, t0y, t1x, t1y, t2x, t2y,
+			   min_u, min_v, max_u, max_v,
               texpage_x, texpage_y, clut_x, clut_y,
               texture_blend_mode,
               depth_shift,
@ -418,6 +422,8 @@ void rsx_intf_push_quad(
 	uint16_t t1x, uint16_t t1y,
 	uint16_t t2x, uint16_t t2y,
 	uint16_t t3x, uint16_t t3y,
+	uint16_t min_u, uint16_t min_v,
+	uint16_t max_u, uint16_t max_v,
 	uint16_t texpage_x, uint16_t texpage_y,
 	uint16_t clut_x, uint16_t clut_y,
 	uint8_t texture_blend_mode,
@ -450,6 +456,7 @@ void rsx_intf_push_quad(
 		rsx_gl_push_quad(p0x, p0y, p0w, p1x, p1y, p1w, p2x, p2y, p2w, p3x, p3y, p3w,
 			c0, c1, c2, c3,
 			t0x, t0y, t1x, t1y, t2x, t2y, t3x, t3y,
+			min_u, min_v, max_u, max_v,
 			texpage_x, texpage_y, clut_x, clut_y,
 			texture_blend_mode,
 			depth_shift,
@ -462,6 +469,7 @@ void rsx_intf_push_quad(
 		rsx_vulkan_push_quad(p0x, p0y, p0w, p1x, p1y, p1w, p2x, p2y, p2w, p3x, p3y, p3w,
 			c0, c1, c2, c3,
 			t0x, t0y, t1x, t1y, t2x, t2y, t3x, t3y,
+			min_u, min_v, max_u, max_v,
 			texpage_x, texpage_y, clut_x, clut_y,
 			texture_blend_mode,
 			depth_shift,
--- a/rsx/rsx_intf.h
+++ b/rsx/rsx_intf.h
@ -55,6 +55,8 @@ void rsx_intf_push_triangle(float p0x, float p0y, float p0w,
                            uint32_t c0, uint32_t c1, uint32_t c2,
                            uint16_t t0x, uint16_t t0y,
                            uint16_t t1x, uint16_t t1y,
+	                        uint16_t min_u, uint16_t min_v,
+	                        uint16_t max_u, uint16_t max_v,
                            uint16_t t2x, uint16_t t2y,
                            uint16_t texpage_x, uint16_t texpage_y,
                            uint16_t clut_x, uint16_t clut_y,
@ -80,6 +82,8 @@ void rsx_intf_push_quad(float p0x, float p0y, float p0w,
                        uint16_t t1x, uint16_t t1y,
                        uint16_t t2x, uint16_t t2y,
                        uint16_t t3x, uint16_t t3y,
+	                    uint16_t min_u, uint16_t min_v,
+	                    uint16_t max_u, uint16_t max_v,
                        uint16_t texpage_x, uint16_t texpage_y,
                        uint16_t clut_x, uint16_t clut_y,
                        uint8_t texture_blend_mode,
--- a/rsx/rsx_lib_gl.cpp
+++ b/rsx/rsx_lib_gl.cpp
@ -163,10 +163,11 @@ struct CommandVertex {
   uint8_t dither;
   /* 0: primitive is opaque, 1: primitive is semi-transparent */
   uint8_t semi_transparent;
-   /* Texture window mask/OR values */
-   uint8_t texture_window[4];
   /* Texture limits of primtive */
   uint16_t texture_limits[4];
+   /* Texture window mask/OR values */
+   uint8_t texture_window[4];
+

   static std::vector<Attribute> attributes();
 };
@ -1308,11 +1309,11 @@ static bool GlRenderer_new(GlRenderer *renderer, DrawConfig config)
   if (dither_mode == DITHER_OFF)
   {
      /* Dithering is superfluous when we increase the internal
-       * color depth, but users asked for it */
-      DrawBuffer_disable_attribute(command_buffer, "dither");
+      * color depth, but users asked for it */
+	   DrawBuffer_disable_attribute(command_buffer, "dither");
   } else
   {
-      DrawBuffer_enable_attribute(command_buffer, "dither");
+	   DrawBuffer_enable_attribute(command_buffer, "dither");
   }

   GLenum command_draw_mode = wireframe ? GL_LINE : GL_FILL;
@ -1558,22 +1559,22 @@ static bool retro_refresh_variables(GlRenderer *renderer)
   dither_mode dither_mode = DITHER_NATIVE;
   if (environ_cb(RETRO_ENVIRONMENT_GET_VARIABLE, &var) && var.value)
   {
-      if (!strcmp(var.value, "1x(native)"))
-      {
-         dither_mode = DITHER_NATIVE;
-         DrawBuffer_enable_attribute(renderer->command_buffer, "dither");
-      }
+	   if (!strcmp(var.value, "1x(native)"))
+	   {
+		   dither_mode = DITHER_NATIVE;
+		   DrawBuffer_enable_attribute(renderer->command_buffer, "dither");
+	   }

-      else if (!strcmp(var.value, "internal resolution"))
-      {
-         dither_mode = DITHER_UPSCALED;
-         DrawBuffer_enable_attribute(renderer->command_buffer, "dither");
-      }
-      else if (!strcmp(var.value, "disabled"))
-      {
-         dither_mode  = DITHER_OFF;
-         DrawBuffer_disable_attribute(renderer->command_buffer, "dither");
-      }
+	   else if (!strcmp(var.value, "internal resolution"))
+	   {
+		   dither_mode = DITHER_UPSCALED;
+		   DrawBuffer_enable_attribute(renderer->command_buffer, "dither");
+	   }
+	   else if (!strcmp(var.value, "disabled"))
+	   {
+		   dither_mode  = DITHER_OFF;
+		   DrawBuffer_disable_attribute(renderer->command_buffer, "dither");
+	   }
   }

   var.key = BEETLE_OPT(wireframe);
@ -1590,7 +1591,7 @@ static bool retro_refresh_variables(GlRenderer *renderer)

   if (rebuild_fb_out)
   {
-      if (dither_mode == DITHER_OFF)
+	  if (dither_mode == DITHER_OFF)
         DrawBuffer_disable_attribute(renderer->command_buffer, "dither");
      else
         DrawBuffer_enable_attribute(renderer->command_buffer, "dither");
@ -1664,147 +1665,7 @@ static bool retro_refresh_variables(GlRenderer *renderer)
   return reconfigure_frontend;
 }

-static void texCoord_preprocessing(
-		GlRenderer *renderer,
-		CommandVertex *vertices,
-		unsigned count)
-{
-	// iCB: Just borrowing this from \parallel-psx\renderer\renderer.cpp
-	uint16_t min_u = UINT16_MAX;
-	uint16_t max_u = 0;
-	uint16_t min_v = UINT16_MAX;
-	uint16_t max_v = 0;

-	uint16_t off_u = 0;
-	uint16_t off_v = 0;
-
-	if (vertices[0].texture_blend_mode != 0)
-	{
-		// For X/Y flipped 2D sprites, PSX games rely on a very specific rasterization behavior.
-		// If U or V is decreasing in X or Y, and we use the provided U/V as is, we will sample the wrong texel as interpolation
-		// covers an entire pixel, while PSX samples its interpolation essentially in the top-left corner and splats that interpolant across the entire pixel.
-		// While we could emulate this reasonably well in native resolution by shifting our vertex coords by 0.5,
-		// this breaks in upscaling scenarios, because we have several samples per native sample and we need NN rules to hit the same UV every time.
-		// One approach here is to use interpolate at offset or similar tricks to generalize the PSX interpolation patterns,
-		// but the problem is that vertices sharing an edge will no longer see the same UV (due to different plane derivatives),
-		// we end up sampling outside the intended boundary and artifacts are inevitable, so the only case where we can apply this fixup is for "sprites"
-		// or similar which should not share edges, which leads to this unfortunate code below.
-		//
-		// Only apply this workaround for quads.
-		if (count == 4)
-		{
-			// It might be faster to do more direct checking here, but the code below handles primitives in any order
-			// and orientation, and is far more SIMD-friendly if needed.
-			float abx = vertices[1].position[0] - vertices[0].position[0];
-			float aby = vertices[1].position[1] - vertices[0].position[1];
-			float bcx = vertices[2].position[0] - vertices[1].position[0];
-			float bcy = vertices[2].position[1] - vertices[1].position[1];
-			float cax = vertices[0].position[0] - vertices[2].position[0];
-			float cay = vertices[0].position[1] - vertices[2].position[1];
-
-			// Compute static derivatives, just assume W is uniform across the primitive
-			// and that the plane equation remains the same across the quad.
-			float dudx = -aby * float(vertices[2].texture_coord[0]) - bcy * float(vertices[0].texture_coord[0]) - cay * float(vertices[1].texture_coord[0]);
-			float dvdx = -aby * float(vertices[2].texture_coord[1]) - bcy * float(vertices[0].texture_coord[1]) - cay * float(vertices[1].texture_coord[1]);
-			float dudy = +abx * float(vertices[2].texture_coord[0]) + bcx * float(vertices[0].texture_coord[0]) + cax * float(vertices[1].texture_coord[0]);
-			float dvdy = +abx * float(vertices[2].texture_coord[1]) + bcx * float(vertices[0].texture_coord[1]) + cax * float(vertices[1].texture_coord[1]);
-			float area = bcx * cay - bcy * cax;
-
-			// iCB: Detect and reject any triangles with 0 size texture area
-			float texArea = (vertices[1].texture_coord[0] - vertices[0].texture_coord[0]) * (vertices[2].texture_coord[1] - vertices[0].texture_coord[1]) - (vertices[2].texture_coord[0] - vertices[0].texture_coord[0]) * (vertices[1].texture_coord[1] - vertices[0].texture_coord[1]);
-
-			// Shouldn't matter as degenerate primitives will be culled anyways.
-			if ((area != 0.0f) && (texArea != 0.0f))
-			{
-				float inv_area = 1.0f / area;
-				dudx *= inv_area;
-				dudy *= inv_area;
-				dvdx *= inv_area;
-				dvdy *= inv_area;
-
-				bool neg_dudx = dudx < 0.0f;
-				bool neg_dudy = dudy < 0.0f;
-				bool neg_dvdx = dvdx < 0.0f;
-				bool neg_dvdy = dvdy < 0.0f;
-				bool zero_dudx = dudx == 0.0f;
-				bool zero_dudy = dudy == 0.0f;
-				bool zero_dvdx = dvdx == 0.0f;
-				bool zero_dvdy = dvdy == 0.0f;
-
-				// If we have negative dU or dV in any direction, increment the U or V to work properly with nearest-neighbor in this impl.
-				// If we don't have 1:1 pixel correspondence, this creates a slight "shift" in the sprite, but we guarantee that we don't sample garbage at least.
-				// Overall, this is kinda hacky because there can be legitimate, rare cases where 3D meshes hit this scenario, and a single texel offset can pop in, but
-				// this is way better than having borked 2D overall.
-				// TODO: Try to figure out if this can be generalized.
-				//
-				// TODO: If perf becomes an issue, we can probably SIMD the 8 comparisons above,
-				// create an 8-bit code, and use a LUT to get the offsets.
-				// Case 1: U is decreasing in X, but no change in Y.
-				// Case 2: U is decreasing in Y, but no change in X.
-				// Case 3: V is decreasing in X, but no change in Y.
-				// Case 4: V is decreasing in Y, but no change in X.
-				if (neg_dudx && zero_dudy)
-					off_u++;
-				else if (neg_dudy && zero_dudx)
-					off_u++;
-				if (neg_dvdx && zero_dvdy)
-					off_v++;
-				else if (neg_dvdy && zero_dvdx)
-					off_v++;
-			}
-		}
-
-		if (renderer->tex_x_mask == 0xffu && renderer->tex_y_mask == 0xffu)
-		{
-			// If we're not using texture window, we're likely accessing a small subset of the texture.
-			for (unsigned i = 0; i < count; i++)
-			{
-				min_u = std::min(min_u, vertices[i].texture_coord[0]);
-				max_u = std::max(max_u, vertices[i].texture_coord[0]);
-				min_v = std::min(min_v, vertices[i].texture_coord[1]);
-				max_v = std::max(max_v, vertices[i].texture_coord[1]);
-			}
-
-			min_u += off_u;
-			max_u += off_u;
-			min_v += off_v;
-			max_v += off_v;
-
-			// In nearest neighbor, we'll get *very* close to this UV, but not close enough to actually sample it.
-			// If du/dx or dv/dx are negative, we probably need to invert this though ...
-			if (max_u > min_u)
-				max_u--;
-			if (max_v > min_v)
-				max_v--;
-
-			// If there's no wrapping, we can prewrap and avoid fallback.
-			if ((max_u & 0xff00) == (min_u & 0xff00))
-				max_u &= 0xff;
-			if ((max_v & 0xff00) == (min_v & 0xff00))
-				max_v &= 0xff;
-		}
-		else
-		{
-			// texture window so don't clamp texture
-			min_u = 0;
-			max_u = UINT16_MAX;
-			min_v = 0;
-			max_v = UINT16_MAX;
-		}
-	}
-
-	for (unsigned i = 0; i < count; i++)
-	{
-		vertices[i].texture_coord[0] += off_u;
-		vertices[i].texture_coord[1] += off_v;
-
-		vertices[i].texture_limits[0] = min_u;
-		vertices[i].texture_limits[1] = min_v;
-		vertices[i].texture_limits[2] = max_u;
-		vertices[i].texture_limits[3] = max_v;
-	}
-
-}

 static void vertex_preprocessing(
      GlRenderer *renderer,
@ -1838,7 +1699,6 @@ static void vertex_preprocessing(
   int16_t z = renderer->primitive_ordering;
   renderer->primitive_ordering += 1;
   
-   texCoord_preprocessing(renderer, v, count);

   for (unsigned i = 0; i < count; i++)
   {
@ -2537,6 +2397,8 @@ void rsx_gl_push_quad(  float p0x, float p0y, float p0w,
                        uint16_t t1x, uint16_t t1y,
                        uint16_t t2x, uint16_t t2y,
                        uint16_t t3x, uint16_t t3y,
+	                    uint16_t min_u, uint16_t min_v,
+                     	uint16_t max_u, uint16_t max_v,
                        uint16_t texpage_x, uint16_t texpage_y,
                        uint16_t clut_x, uint16_t clut_y,
                        uint8_t texture_blend_mode,
@ -2591,6 +2453,7 @@ void rsx_gl_push_quad(  float p0x, float p0y, float p0w,
         depth_shift,
         (uint8_t) dither,
         semi_transparent,
+		 {min_u, min_v, max_u, max_v},
      },
      {
         {p1x, p1y, 0.95, p1w }, /* position */
@ -2602,6 +2465,7 @@ void rsx_gl_push_quad(  float p0x, float p0y, float p0w,
         depth_shift,
         (uint8_t) dither,
         semi_transparent,
+		 {min_u, min_v, max_u, max_v},
      },
      {
         {p2x, p2y, 0.95, p2w }, /* position */
@ -2613,6 +2477,7 @@ void rsx_gl_push_quad(  float p0x, float p0y, float p0w,
         depth_shift,
         (uint8_t) dither,
         semi_transparent,
+		 {min_u, min_v, max_u, max_v},
      },
      {
         {p3x, p3y, 0.95, p3w }, /* position */
@ -2624,6 +2489,7 @@ void rsx_gl_push_quad(  float p0x, float p0y, float p0w,
         depth_shift,
         (uint8_t) dither,
         semi_transparent,
+		 { min_u, min_v, max_u, max_v },
      },
   };

@ -2652,6 +2518,8 @@ void rsx_gl_push_triangle( float p0x, float p0y, float p0w,
                           uint16_t t0x, uint16_t t0y,
                           uint16_t t1x, uint16_t t1y,
                           uint16_t t2x, uint16_t t2y,
+						   uint16_t min_u, uint16_t min_v,
+						   uint16_t max_u, uint16_t max_v,
                           uint16_t texpage_x, uint16_t texpage_y,
                           uint16_t clut_x, uint16_t clut_y,
                           uint8_t texture_blend_mode,
@ -2705,6 +2573,7 @@ void rsx_gl_push_triangle( float p0x, float p0y, float p0w,
         depth_shift,
         (uint8_t) dither,
         semi_transparent,
+		 {min_u, min_v, max_u, max_v},
      },
      {
         {p1x, p1y, 0.95, p1w }, /* position */
@ -2716,6 +2585,7 @@ void rsx_gl_push_triangle( float p0x, float p0y, float p0w,
         depth_shift,
         (uint8_t) dither,
         semi_transparent,
+		 {min_u, min_v, max_u, max_v},
      },
      {
         {p2x, p2y, 0.95, p2w }, /* position */
@ -2727,6 +2597,7 @@ void rsx_gl_push_triangle( float p0x, float p0y, float p0w,
         depth_shift,
         (uint8_t) dither,
         semi_transparent,
+		 {min_u, min_v, max_u, max_v},
      }
   };

--- a/rsx/rsx_lib_gl.h
+++ b/rsx/rsx_lib_gl.h
@ -39,6 +39,8 @@ void rsx_gl_push_triangle( float p0x, float p0y, float p0w,
                           uint16_t t0x, uint16_t t0y,
                           uint16_t t1x, uint16_t t1y,
                           uint16_t t2x, uint16_t t2y,
+	                       uint16_t min_u, uint16_t min_v,
+	                       uint16_t max_u, uint16_t max_v,
                           uint16_t texpage_x, uint16_t texpage_y,
                           uint16_t clut_x, uint16_t clut_y,
                           uint8_t texture_blend_mode,
@ -59,6 +61,8 @@ void rsx_gl_push_quad(  float p0x, float p0y, float p0w,
                        uint16_t t1x, uint16_t t1y,
                        uint16_t t2x, uint16_t t2y,
                        uint16_t t3x, uint16_t t3y,
+	                    uint16_t min_u, uint16_t min_v,
+	                    uint16_t max_u, uint16_t max_v,
                        uint16_t texpage_x, uint16_t texpage_y,
                        uint16_t clut_x, uint16_t clut_y,
                        uint8_t texture_blend_mode,
--- a/rsx/rsx_lib_vulkan.cpp
+++ b/rsx/rsx_lib_vulkan.cpp
@ -361,6 +361,8 @@ void rsx_vulkan_push_quad(
      uint16_t t1x, uint16_t t1y,
      uint16_t t2x, uint16_t t2y,
      uint16_t t3x, uint16_t t3y,
+	  uint16_t min_u, uint16_t min_v,
+	  uint16_t max_u, uint16_t max_v,
      uint16_t texpage_x, uint16_t texpage_y,
      uint16_t clut_x, uint16_t clut_y,
      uint8_t texture_blend_mode,
@ -377,6 +379,7 @@ void rsx_vulkan_push_quad(
   renderer->set_dither(dither);
   renderer->set_mask_test(mask_test);
   renderer->set_force_mask_bit(set_mask);
+   renderer->set_UV_limits(min_u, min_v, max_u, max_v);
   if (texture_blend_mode != 0)
   {
      switch (depth_shift)
@ -436,6 +439,8 @@ void rsx_vulkan_push_triangle(
      uint16_t t0x, uint16_t t0y,
      uint16_t t1x, uint16_t t1y,
      uint16_t t2x, uint16_t t2y,
+	  uint16_t min_u, uint16_t min_v,
+	  uint16_t max_u, uint16_t max_v,
      uint16_t texpage_x, uint16_t texpage_y,
      uint16_t clut_x, uint16_t clut_y,
      uint8_t texture_blend_mode,
@ -452,6 +457,7 @@ void rsx_vulkan_push_triangle(
   renderer->set_dither(dither);
   renderer->set_mask_test(mask_test);
   renderer->set_force_mask_bit(set_mask);
+   renderer->set_UV_limits(min_u, min_v, max_u, max_v);
   if (texture_blend_mode != 0)
   {
      switch (depth_shift)
--- a/rsx/rsx_lib_vulkan.h
+++ b/rsx/rsx_lib_vulkan.h
@ -35,6 +35,8 @@ void rsx_vulkan_push_triangle(float p0x, float p0y, float p0w,
                              uint16_t t0x, uint16_t t0y,
                              uint16_t t1x, uint16_t t1y,
                              uint16_t t2x, uint16_t t2y,
+							  uint16_t min_u, uint16_t min_v,
+							  uint16_t max_u, uint16_t max_v,
                              uint16_t texpage_x, uint16_t texpage_y,
                              uint16_t clut_x, uint16_t clut_y,
                              uint8_t texture_blend_mode,
@ -51,6 +53,8 @@ void rsx_vulkan_push_quad(float p0x, float p0y, float p0w,
                          uint16_t t1x, uint16_t t1y,
                          uint16_t t2x, uint16_t t2y,
                          uint16_t t3x, uint16_t t3y,
+						  uint16_t min_u, uint16_t min_v,
+						  uint16_t max_u, uint16_t max_v,
                          uint16_t texpage_x, uint16_t texpage_y,
                          uint16_t clut_x, uint16_t clut_y,
                          uint8_t texture_blend_mode,