2024-12-02 14:47:21 +00:00 · 2008-07-06 00:02:19 +00:00 · 2008-07-06 00:02:19 +00:00 · f559ee9342
commit f559ee9342
parent 0dac042178
2 changed files with 258 additions and 10 deletions
--- a/gsdx/GSBlock.h
+++ b/gsdx/GSBlock.h
@ -1436,6 +1436,262 @@ public:
 		#endif
 	}

+	// Unpacks an 8x8 block of packed 4-bit values and merges each one into bits
+	// 24-27 (mask 0x0f000000) of the corresponding DWORD of the 64-DWORD block at
+	// dst; every other destination bit keeps its previous value.
+	//
+	// src      - first of 8 rows of 4 bytes each; within a byte the low nibble is
+	//            the earlier value, the high nibble the later one
+	// srcpitch - distance in bytes between consecutive source rows
+	// dst      - destination block, updated in place
+	//
+	// Does in one pass what UnpackBlock4HL into a temporary block followed by
+	// WriteBlock32<true, 0x0f000000> would do, without the intermediate buffer.
+	//
+	// NOTE(review): the SSE paths access dst through GSVector4i*, which presumably
+	// requires 16-byte alignment - confirm against GSVector4i.
+	__forceinline static void UnpackAndWriteBlock4HL(BYTE* src, int srcpitch, BYTE* dst)
+	{
+		#if _M_SSE >= 0x301
+
+		GSVector4i nibble(0x0f0f0f0f);
+		GSVector4i target(0x0f000000);
+		GSVector4i spread0 = m_uw8hmask0;
+		GSVector4i spread1 = m_uw8hmask1;
+		GSVector4i spread2 = m_uw8hmask2;
+		GSVector4i spread3 = m_uw8hmask3;
+
+		// Each pass consumes four source rows and updates eight destination vectors.
+		for(int pass = 0; pass < 2; pass++)
+		{
+			GSVector4i* out = (GSVector4i*)dst + pass * 8;
+
+			GSVector4i rows(
+				*(DWORD*)(src + srcpitch * 0),
+				*(DWORD*)(src + srcpitch * 1),
+				*(DWORD*)(src + srcpitch * 2),
+				*(DWORD*)(src + srcpitch * 3));
+
+			// One byte per 4-bit value: low nibbles in lo, high nibbles in hi.
+			GSVector4i lo = rows & nibble;
+			GSVector4i hi = (rows >> 4) & nibble;
+
+			{
+				// Low half of the lo/hi byte pairing (presumably a byte interleave,
+				// which restores source order - see GSVector4i::upl8).
+				GSVector4i bytes = lo.upl8(hi);
+
+				// The m_uw8hmask* shuffles pick the four bytes each output vector needs.
+				GSVector4i q0 = bytes.shuffle8(spread0);
+				GSVector4i q1 = bytes.shuffle8(spread1);
+				GSVector4i q2 = bytes.shuffle8(spread2);
+				GSVector4i q3 = bytes.shuffle8(spread3);
+
+				// Read-modify-write: only the bits selected by target are taken from q*.
+				out[0] = out[0].blend(q0, target);
+				out[1] = out[1].blend(q1, target);
+				out[2] = out[2].blend(q2, target);
+				out[3] = out[3].blend(q3, target);
+			}
+
+			{
+				// High half of the same pairing.
+				GSVector4i bytes = lo.uph8(hi);
+
+				GSVector4i q0 = bytes.shuffle8(spread0);
+				GSVector4i q1 = bytes.shuffle8(spread1);
+				GSVector4i q2 = bytes.shuffle8(spread2);
+				GSVector4i q3 = bytes.shuffle8(spread3);
+
+				out[4] = out[4].blend(q0, target);
+				out[5] = out[5].blend(q1, target);
+				out[6] = out[6].blend(q2, target);
+				out[7] = out[7].blend(q3, target);
+			}
+
+			src += srcpitch * 4;
+		}
+
+		#elif _M_SSE >= 0x200
+
+		GSVector4i nibble(0x0f0f0f0f);
+		GSVector4i target(0x0f000000);
+
+		// Each pass consumes four source rows and updates eight destination vectors.
+		for(int pass = 0; pass < 2; pass++)
+		{
+			GSVector4i* out = (GSVector4i*)dst + pass * 8;
+
+			GSVector4i rows(
+				*(DWORD*)(src + srcpitch * 0),
+				*(DWORD*)(src + srcpitch * 1),
+				*(DWORD*)(src + srcpitch * 2),
+				*(DWORD*)(src + srcpitch * 3));
+
+			// One byte per 4-bit value: low nibbles in lo, high nibbles in hi.
+			GSVector4i lo = rows & nibble;
+			GSVector4i hi = (rows >> 4) & nibble;
+
+			{
+				GSVector4i bytes = lo.upl8(hi);
+
+				// Without a byte shuffle, widen by pairing each vector with itself
+				// twice (8-bit, then 16-bit), presumably replicating every value
+				// across all four bytes of its own DWORD.
+				GSVector4i pairsLo = bytes.upl8(bytes);
+				GSVector4i pairsHi = bytes.uph8(bytes);
+
+				GSVector4i q0 = pairsLo.upl16(pairsLo);
+				GSVector4i q1 = pairsLo.uph16(pairsLo);
+				GSVector4i q2 = pairsHi.upl16(pairsHi);
+				GSVector4i q3 = pairsHi.uph16(pairsHi);
+
+				// In-place swizzle of the four vectors; note the (0, 2, 1, 3) argument order.
+				GSVector4i::sw64(q0, q2, q1, q3);
+
+				// Read-modify-write: only the bits selected by target are taken from q*.
+				out[0] = out[0].blend(q0, target);
+				out[1] = out[1].blend(q1, target);
+				out[2] = out[2].blend(q2, target);
+				out[3] = out[3].blend(q3, target);
+			}
+
+			{
+				GSVector4i bytes = lo.uph8(hi);
+
+				GSVector4i pairsLo = bytes.upl8(bytes);
+				GSVector4i pairsHi = bytes.uph8(bytes);
+
+				GSVector4i q0 = pairsLo.upl16(pairsLo);
+				GSVector4i q1 = pairsLo.uph16(pairsLo);
+				GSVector4i q2 = pairsHi.upl16(pairsHi);
+				GSVector4i q3 = pairsHi.uph16(pairsHi);
+
+				GSVector4i::sw64(q0, q2, q1, q3);
+
+				out[4] = out[4].blend(q0, target);
+				out[5] = out[5].blend(q1, target);
+				out[6] = out[6].blend(q2, target);
+				out[7] = out[7].blend(q3, target);
+			}
+
+			src += srcpitch * 4;
+		}
+
+		#else
+
+		// Scalar fallback: columnTable32 supplies, eight entries per row, the DWORD
+		// index inside the destination block for each of the row's eight values.
+		DWORD* out = (DWORD*)dst;
+		const DWORD* col = &columnTable32[0][0];
+
+		for(int row = 0; row < 8; row++)
+		{
+			for(int k = 0; k < 4; k++)
+			{
+				// Low nibble of byte k -> bits 24-27 of the even-numbered value.
+				DWORD& even = out[col[k * 2 + 0]];
+				even = (even & ~0x0f000000) | ((src[k] & 0x0f) << 24);
+
+				// High nibble of byte k -> bits 24-27 of the odd-numbered value.
+				DWORD& odd = out[col[k * 2 + 1]];
+				odd = (odd & ~0x0f000000) | ((src[k] & 0xf0) << 20);
+			}
+
+			col += 8;
+			src += srcpitch;
+		}
+
+		#endif
+	}
+
+	// Unpacks an 8x8 block of packed 4-bit values and merges each one into bits
+	// 28-31 (mask 0xf0000000) of the corresponding DWORD of the 64-DWORD block at
+	// dst; every other destination bit keeps its previous value.
+	//
+	// src      - first of 8 rows of 4 bytes each; within a byte the low nibble is
+	//            the earlier value, the high nibble the later one
+	// srcpitch - distance in bytes between consecutive source rows
+	// dst      - destination block, updated in place
+	//
+	// NOTE(review): the SSE paths access dst through GSVector4i*, which presumably
+	// requires 16-byte alignment - confirm against GSVector4i.
+	__forceinline static void UnpackAndWriteBlock4HH(BYTE* src, int srcpitch, BYTE* dst)
+	{
+		#if _M_SSE >= 0x301
+
+		GSVector4i mask(0xf0f0f0f0);
+		GSVector4i mask0 = m_uw8hmask0;
+		GSVector4i mask1 = m_uw8hmask1;
+		GSVector4i mask2 = m_uw8hmask2;
+		GSVector4i mask3 = m_uw8hmask3;
+		GSVector4i mask4(0xf0000000);
+
+		// Each iteration consumes four source rows and updates eight destination vectors.
+		for(int i = 0; i < 2; i++, src += srcpitch * 4)
+		{
+			GSVector4i v(
+				*(DWORD*)&src[srcpitch * 0], 
+				*(DWORD*)&src[srcpitch * 1], 
+				*(DWORD*)&src[srcpitch * 2], 
+				*(DWORD*)&src[srcpitch * 3]);
+
+			// One byte per 4-bit value, already positioned in the byte's top nibble:
+			// low nibbles are shifted up into lo, high nibbles stay in place in hi.
+			GSVector4i lo = (v << 4) & mask;
+			GSVector4i hi = v & mask;
+
+			{
+				// Low half of the lo/hi byte pairing (presumably a byte interleave,
+				// which restores source order - see GSVector4i::upl8).
+				GSVector4i v4 = lo.upl8(hi);
+
+				// The m_uw8hmask* shuffles pick the four bytes each output vector needs.
+				GSVector4i v0 = v4.shuffle8(mask0);
+				GSVector4i v1 = v4.shuffle8(mask1);
+				GSVector4i v2 = v4.shuffle8(mask2);
+				GSVector4i v3 = v4.shuffle8(mask3);
+
+				// Read-modify-write: only the bits selected by mask4 are taken from v*.
+				((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask4);
+				((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask4);
+				((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask4);
+				((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask4);
+			}
+
+			{
+				// High half of the same pairing.
+				GSVector4i v4 = lo.uph8(hi);
+
+				GSVector4i v0 = v4.shuffle8(mask0);
+				GSVector4i v1 = v4.shuffle8(mask1);
+				GSVector4i v2 = v4.shuffle8(mask2);
+				GSVector4i v3 = v4.shuffle8(mask3);
+
+				((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask4);
+				((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask4);
+				((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask4);
+				((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask4);
+			}
+		}
+
+		#elif _M_SSE >= 0x200
+/*
+		Two-step form that this single-pass code stands in for (needs a temporary block):
+
+		__declspec(align(16)) DWORD block[8 * 8];
+
+		UnpackBlock4HH(src, srcpitch, block);
+
+		WriteBlock32<true, 0xf0000000>(dst, (BYTE*)block, sizeof(block) / 8);
+*/
+		GSVector4i mask(0xf0f0f0f0);
+		GSVector4i mask2(0xf0000000);
+
+		// Each iteration consumes four source rows and updates eight destination vectors.
+		for(int i = 0; i < 2; i++, src += srcpitch * 4)
+		{
+			GSVector4i v(
+				*(DWORD*)&src[srcpitch * 0], 
+				*(DWORD*)&src[srcpitch * 1], 
+				*(DWORD*)&src[srcpitch * 2], 
+				*(DWORD*)&src[srcpitch * 3]);
+
+			// One byte per 4-bit value, already positioned in the byte's top nibble.
+			GSVector4i lo = (v << 4) & mask;
+			GSVector4i hi = v & mask;
+
+			{
+				GSVector4i v4 = lo.upl8(hi);
+
+				// Without a byte shuffle, widen by pairing each vector with itself
+				// twice (8-bit, then 16-bit), presumably replicating every value
+				// across all four bytes of its own DWORD.
+				GSVector4i v5 = v4.upl8(v4);
+				GSVector4i v6 = v4.uph8(v4);
+
+				GSVector4i v0 = v5.upl16(v5);
+				GSVector4i v1 = v5.uph16(v5);
+				GSVector4i v2 = v6.upl16(v6);
+				GSVector4i v3 = v6.uph16(v6);
+
+				// In-place swizzle of the four vectors; note the (0, 2, 1, 3) argument order.
+				GSVector4i::sw64(v0, v2, v1, v3);
+
+				// Read-modify-write: only the bits selected by mask2 are taken from v*.
+				((GSVector4i*)dst)[i * 8 + 0] = ((GSVector4i*)dst)[i * 8 + 0].blend(v0, mask2);
+				((GSVector4i*)dst)[i * 8 + 1] = ((GSVector4i*)dst)[i * 8 + 1].blend(v1, mask2);
+				((GSVector4i*)dst)[i * 8 + 2] = ((GSVector4i*)dst)[i * 8 + 2].blend(v2, mask2);
+				((GSVector4i*)dst)[i * 8 + 3] = ((GSVector4i*)dst)[i * 8 + 3].blend(v3, mask2);
+			}
+
+			{
+				GSVector4i v4 = lo.uph8(hi);
+
+				GSVector4i v5 = v4.upl8(v4);
+				GSVector4i v6 = v4.uph8(v4);
+
+				GSVector4i v0 = v5.upl16(v5);
+				GSVector4i v1 = v5.uph16(v5);
+				GSVector4i v2 = v6.upl16(v6);
+				GSVector4i v3 = v6.uph16(v6);
+
+				GSVector4i::sw64(v0, v2, v1, v3);
+
+				((GSVector4i*)dst)[i * 8 + 4] = ((GSVector4i*)dst)[i * 8 + 4].blend(v0, mask2);
+				((GSVector4i*)dst)[i * 8 + 5] = ((GSVector4i*)dst)[i * 8 + 5].blend(v1, mask2);
+				((GSVector4i*)dst)[i * 8 + 6] = ((GSVector4i*)dst)[i * 8 + 6].blend(v2, mask2);
+				((GSVector4i*)dst)[i * 8 + 7] = ((GSVector4i*)dst)[i * 8 + 7].blend(v3, mask2);
+			}
+		}
+
+		#else
+
+		// Scalar fallback: columnTable32 supplies, eight entries per row, the DWORD
+		// index inside the destination block for each of the row's eight values.
+		const DWORD* d = &columnTable32[0][0];
+
+		for(int j = 0; j < 8; j++, d += 8, src += srcpitch)
+		{
+			for(int i = 0; i < 4; i++)
+			{
+				// Cast to DWORD before shifting: src[i] is promoted to (signed) int, and
+				// moving a nibble into bits 28-31 would otherwise shift into the sign bit.
+				((DWORD*)dst)[d[i * 2 + 0]] = (((DWORD*)dst)[d[i * 2 + 0]] & ~0xf0000000) | ((DWORD)(src[i] & 0x0f) << 28);
+				((DWORD*)dst)[d[i * 2 + 1]] = (((DWORD*)dst)[d[i * 2 + 1]] & ~0xf0000000) | ((DWORD)(src[i] & 0xf0) << 24);
+			}
+		}
+
+		#endif
+	}
+
 	template<bool AEM> __forceinline static void ReadAndExpandBlock24(BYTE* src, BYTE* dst, int dstpitch, const GIFRegTEXA& TEXA)
 	{
 		#if _M_SSE >= 0x200
--- a/gsdx/GSLocalMemory.cpp
+++ b/gsdx/GSLocalMemory.cpp
@ -1354,17 +1354,13 @@ void GSLocalMemory::WriteImage4HL(int& tx, int& ty, BYTE* src, int len, GIFRegBI
 	}
 	else
 	{
-		__declspec(align(16)) DWORD block[8 * 8];
-
 		th += ty;

 		for(int y = ty; y < th; y += 8, src += srcpitch * 8)
 		{
 			for(int x = tx; x < tw; x += 8)
 			{
-				UnpackBlock4HL(src + (x - tx) / 2, srcpitch, block);
-
-				WriteBlock32<true, 0x0f000000>((BYTE*)&m_vm32[BlockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], (BYTE*)block, sizeof(block) / 8);
+				UnpackAndWriteBlock4HL(src + (x - tx) / 2, srcpitch, (BYTE*)&m_vm32[BlockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)]);
 			}
 		}

@ -1389,17 +1385,13 @@ void GSLocalMemory::WriteImage4HH(int& tx, int& ty, BYTE* src, int len, GIFRegBI
 	}
 	else
 	{
-		__declspec(align(16)) DWORD block[8 * 8];
-
 		th += ty;

 		for(int y = ty; y < th; y += 8, src += srcpitch * 8)
 		{
 			for(int x = tx; x < tw; x += 8)
 			{
-				UnpackBlock4HH(src + (x - tx) / 2, srcpitch, block);
-
-				WriteBlock32<true, 0xf0000000>((BYTE*)&m_vm32[BlockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], (BYTE*)block, sizeof(block) / 8);
+				UnpackAndWriteBlock4HH(src + (x - tx) / 2, srcpitch, (BYTE*)&m_vm32[BlockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)]);
 			}
 		}