From 32e7a3477142bfb8572a5d1c85d6a723986f0cc6 Mon Sep 17 00:00:00 2001 From: gabest Date: Tue, 8 Jul 2008 16:59:38 +0000 Subject: [PATCH] --- gsdx/GSBlock.h | 36 +++-- gsdx/GSLocalMemory.cpp | 296 +++++++++++++++++++++++----------------- gsdx/GSLocalMemory.h | 38 ++++-- gsdx/GSState.cpp | 29 ++-- gsdx/GSVector.h | 143 +++++++++++++++---- gsdx/GSdx.cpp | 45 +++++- gsdx/GSdx_vs2008.vcproj | 1 + gsdx/sse.h | 2 +- sse4.vsprops | 2 +- xpad/xpad_vs2008.vcproj | 1 + 10 files changed, 392 insertions(+), 201 deletions(-) diff --git a/gsdx/GSBlock.h b/gsdx/GSBlock.h index 564581e..f37c271 100644 --- a/gsdx/GSBlock.h +++ b/gsdx/GSBlock.h @@ -68,7 +68,7 @@ public: { GSVector4i v4((int)mask); - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 if(mask == 0xff000000 || mask == 0x00ffffff) { @@ -87,7 +87,7 @@ public: ((GSVector4i*)dst)[i * 4 + 2] = ((GSVector4i*)dst)[i * 4 + 2].blend(v2, v4); ((GSVector4i*)dst)[i * 4 + 3] = ((GSVector4i*)dst)[i * 4 + 3].blend(v3, v4); - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 } @@ -1090,7 +1090,7 @@ public: { for(int j = 0; j < 16; j++, dst += dstpitch) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i* s = (GSVector4i*)src; GSVector4i* d = (GSVector4i*)dst; @@ -1112,7 +1112,7 @@ public: { for(int j = 0; j < 16; j++, dst += dstpitch) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i* s = (GSVector4i*)src; GSVector4i* d = (GSVector4i*)dst; @@ -1134,7 +1134,7 @@ public: { for(int j = 0; j < 16; j++, dst += dstpitch) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i* s = (GSVector4i*)src; GSVector4i* d = (GSVector4i*)dst; @@ -1156,7 +1156,7 @@ public: { for(int j = 0; j < 16; j++, dst += dstpitch) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i* s = (GSVector4i*)src; GSVector4i* d = (GSVector4i*)dst; @@ -1178,7 +1178,7 @@ public: { for(int j = 0; j < 8; j++, dst += dstpitch) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i* s = (GSVector4i*)src; GSVector4i* d = (GSVector4i*)dst; @@ -1201,7 +1201,7 @@ public: { for(int j = 0; j < 8; j++, dst += dstpitch) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i* s = (GSVector4i*)src; GSVector4i* d = (GSVector4i*)dst; @@ -1226,7 +1226,7 @@ public: { for(int j = 0; j < 8; j++, dst += dstpitch) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i* s = (GSVector4i*)src; GSVector4i* d = (GSVector4i*)dst; @@ -1249,7 +1249,7 @@ public: { for(int j = 0; j < 8; j++, dst += dstpitch) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i* s = (GSVector4i*)src; GSVector4i* d = (GSVector4i*)dst; @@ -1274,7 +1274,7 @@ public: { for(int j = 0; j < 8; j++, dst += dstpitch) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i* s = (GSVector4i*)src; GSVector4i* d = (GSVector4i*)dst; @@ -1297,7 +1297,7 @@ public: { for(int j = 0; j < 8; j++, dst += dstpitch) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i* s = (GSVector4i*)src; GSVector4i* d = (GSVector4i*)dst; @@ -1318,8 +1318,6 @@ public: } } - // TODO: UnpackAndWrite* - __forceinline static void UnpackAndWriteBlock24(BYTE* src, int srcpitch, BYTE* dst) { #if _M_SSE >= 0x200 @@ -1755,7 +1753,7 @@ public: __forceinline static void ReadAndExpandBlock8_32(BYTE* src, BYTE* dst, int dstpitch, DWORD* pal) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i v0, v1, v2, v3; @@ -1825,7 +1823,7 @@ public: __forceinline static void ReadAndExpandBlock4_32(BYTE* src, BYTE* dst, int dstpitch, UINT64* pal) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i v0, v1, v2, v3; @@ -1917,7 +1915,7 @@ public: __forceinline static void ReadAndExpandBlock8H_32(BYTE* src, BYTE* dst, int dstpitch, DWORD* pal) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i v0, v1, v2, v3; @@ -1968,7 +1966,7 @@ public: __forceinline static void ReadAndExpandBlock4HL_32(BYTE* src, BYTE* dst, int dstpitch, DWORD* pal) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i v0, v1, v2, v3; @@ -2019,7 +2017,7 @@ public: __forceinline static void ReadAndExpandBlock4HH_32(BYTE* src, BYTE* dst, int dstpitch, DWORD* pal) { - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i v0, v1, v2, v3; diff --git a/gsdx/GSLocalMemory.cpp b/gsdx/GSLocalMemory.cpp index 9599256..e177f87 100644 --- a/gsdx/GSLocalMemory.cpp +++ b/gsdx/GSLocalMemory.cpp @@ -30,11 +30,13 @@ #define ASSERT_BLOCK(r, w, h) \ ASSERT((r).Width() >= w && (r).Height() >= h && !((r).left&(w-1)) && !((r).top&(h-1)) && !((r).right&(w-1)) && !((r).bottom&(h-1))); \ -#define FOREACH_BLOCK_START(r, w, h, t) \ - for(int y = (r).top; y < (r).bottom; y += (h)) \ - { ASSERT_BLOCK(r, w, h); \ - BYTE* ptr = dst + (y-(r).top)*dstpitch; \ - for(int x = (r).left; x < (r).right; x += (w)) \ +#define FOREACH_BLOCK_START(w, h, bpp) \ + DWORD bp = TEX0.TBP0; \ + DWORD bw = TEX0.TBW; \ + int offset = dstpitch * h - (r.right - r.left) * bpp / 8; \ + for(int y = r.top; y < r.bottom; y += h, dst += offset) \ + { ASSERT_BLOCK(r, w, h); \ + for(int x = r.left; x < r.right; x += w, dst += w * bpp / 8) \ { \ #define FOREACH_BLOCK_END }} @@ -813,6 +815,9 @@ void GSLocalMemory::WriteImage32(int& tx, int& ty, BYTE* src, int len, GIFRegBIT { if(TRXREG.RRW == 0) return; + DWORD bp = BITBLTBUF.DBP; + DWORD bw = BITBLTBUF.DBW; + int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX) * 4; int th = len / srcpitch; @@ -832,14 +837,14 @@ void GSLocalMemory::WriteImage32(int& tx, int& ty, BYTE* src, int len, GIFRegBIT { for(int x = tx; x < twa; x += 8) { - WriteBlock32((BYTE*)&m_vm32[BlockAddress32(x, ty, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 4, srcpitch); + WriteBlock32((BYTE*)&m_vm32[BlockAddress32(x, ty, bp, bw)], src + (x - tx) * 4, srcpitch); } for(int i = 0; i < 8; i++, ty++, src += srcpitch) { for(int x = twa; x < tw; x++) { - WritePixel32(x, ty, ((DWORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel32(x, ty, ((DWORD*)src)[x - tx], bp, bw); } } } @@ -857,14 +862,14 @@ void GSLocalMemory::WriteImage32(int& tx, int& ty, BYTE* src, int len, GIFRegBIT { for(int x = tx; x < twa; x += 8) { - WriteColumn32(ty, (BYTE*)&m_vm32[BlockAddress32(x, ty & ~7, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 4, srcpitch); + WriteColumn32(ty, (BYTE*)&m_vm32[BlockAddress32(x, ty & ~7, bp, bw)], src + (x - tx) * 4, srcpitch); } for(int i = 0; i < 2; i++, ty++, src += srcpitch) { for(int x = twa; x < tw; x++) { - WritePixel32(x, ty, ((DWORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel32(x, ty, ((DWORD*)src)[x - tx], bp, bw); } } } @@ -882,7 +887,7 @@ void GSLocalMemory::WriteImage32(int& tx, int& ty, BYTE* src, int len, GIFRegBIT { for(int x = tx; x < tw; x += 8) { - WriteBlock32((BYTE*)&m_vm32[BlockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 4, srcpitch); + WriteBlock32((BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)], src + (x - tx) * 4, srcpitch); } } } @@ -892,7 +897,7 @@ void GSLocalMemory::WriteImage32(int& tx, int& ty, BYTE* src, int len, GIFRegBIT { for(int x = tx; x < tw; x += 8) { - WriteBlock32((BYTE*)&m_vm32[BlockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 4, srcpitch); + WriteBlock32((BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)], src + (x - tx) * 4, srcpitch); } } } @@ -905,6 +910,9 @@ void GSLocalMemory::WriteImage24(int& tx, int& ty, BYTE* src, int len, GIFRegBIT { if(TRXREG.RRW == 0) return; + DWORD bp = BITBLTBUF.DBP; + DWORD bw = BITBLTBUF.DBW; + int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX) * 3; int th = len / srcpitch; @@ -924,7 +932,7 @@ void GSLocalMemory::WriteImage24(int& tx, int& ty, BYTE* src, int len, GIFRegBIT { for(int x = tx; x < tw; x += 8) { - UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, (BYTE*)&m_vm32[BlockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)]); + UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, (BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)]); } } @@ -936,6 +944,9 @@ void GSLocalMemory::WriteImage16(int& tx, int& ty, BYTE* src, int len, GIFRegBIT { if(TRXREG.RRW == 0) return; + DWORD bp = BITBLTBUF.DBP; + DWORD bw = BITBLTBUF.DBW; + int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX) * 2; int th = len / srcpitch; @@ -955,14 +966,14 @@ void GSLocalMemory::WriteImage16(int& tx, int& ty, BYTE* src, int len, GIFRegBIT { for(int x = tx; x < twa; x += 16) { - WriteBlock16((BYTE*)&m_vm16[BlockAddress16(x, ty, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteBlock16((BYTE*)&m_vm16[BlockAddress16(x, ty, bp, bw)], src + (x - tx) * 2, srcpitch); } for(int i = 0; i < 8; i++, ty++, src += srcpitch) { for(int x = twa; x < tw; x++) { - WritePixel16(x, ty, ((WORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel16(x, ty, ((WORD*)src)[x - tx], bp, bw); } } } @@ -980,14 +991,14 @@ void GSLocalMemory::WriteImage16(int& tx, int& ty, BYTE* src, int len, GIFRegBIT { for(int x = tx; x < twa; x += 16) { - WriteColumn16(ty, (BYTE*)&m_vm16[BlockAddress16(x, ty & ~7, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteColumn16(ty, (BYTE*)&m_vm16[BlockAddress16(x, ty & ~7, bp, bw)], src + (x - tx) * 2, srcpitch); } for(int i = 0; i < 2; i++, ty++, src += srcpitch) { for(int x = twa; x < tw; x++) { - WritePixel16(x, ty, ((WORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel16(x, ty, ((WORD*)src)[x - tx], bp, bw); } } } @@ -1005,7 +1016,7 @@ void GSLocalMemory::WriteImage16(int& tx, int& ty, BYTE* src, int len, GIFRegBIT { for(int x = tx; x < tw; x += 16) { - WriteBlock16((BYTE*)&m_vm16[BlockAddress16(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteBlock16((BYTE*)&m_vm16[BlockAddress16(x, y, bp, bw)], src + (x - tx) * 2, srcpitch); } } } @@ -1015,7 +1026,7 @@ void GSLocalMemory::WriteImage16(int& tx, int& ty, BYTE* src, int len, GIFRegBIT { for(int x = tx; x < tw; x += 16) { - WriteBlock16((BYTE*)&m_vm16[BlockAddress16(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteBlock16((BYTE*)&m_vm16[BlockAddress16(x, y, bp, bw)], src + (x - tx) * 2, srcpitch); } } } @@ -1028,6 +1039,9 @@ void GSLocalMemory::WriteImage16S(int& tx, int& ty, BYTE* src, int len, GIFRegBI { if(TRXREG.RRW == 0) return; + DWORD bp = BITBLTBUF.DBP; + DWORD bw = BITBLTBUF.DBW; + int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX) * 2; int th = len / srcpitch; @@ -1047,14 +1061,14 @@ void GSLocalMemory::WriteImage16S(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < twa; x += 16) { - WriteBlock16((BYTE*)&m_vm16[BlockAddress16S(x, ty, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteBlock16((BYTE*)&m_vm16[BlockAddress16S(x, ty, bp, bw)], src + (x - tx) * 2, srcpitch); } for(int i = 0; i < 8; i++, ty++, src += srcpitch) { for(int x = twa; x < tw; x++) { - WritePixel16S(x, ty, ((WORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel16S(x, ty, ((WORD*)src)[x - tx], bp, bw); } } } @@ -1072,14 +1086,14 @@ void GSLocalMemory::WriteImage16S(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < twa; x += 16) { - WriteColumn16(ty, (BYTE*)&m_vm16[BlockAddress16S(x, ty & ~7, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteColumn16(ty, (BYTE*)&m_vm16[BlockAddress16S(x, ty & ~7, bp, bw)], src + (x - tx) * 2, srcpitch); } for(int i = 0; i < 2; i++, ty++, src += srcpitch) { for(int x = twa; x < tw; x++) { - WritePixel16S(x, ty, ((WORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel16S(x, ty, ((WORD*)src)[x - tx], bp, bw); } } } @@ -1097,7 +1111,7 @@ void GSLocalMemory::WriteImage16S(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < tw; x += 16) { - WriteBlock16((BYTE*)&m_vm16[BlockAddress16S(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteBlock16((BYTE*)&m_vm16[BlockAddress16S(x, y, bp, bw)], src + (x - tx) * 2, srcpitch); } } } @@ -1107,7 +1121,7 @@ void GSLocalMemory::WriteImage16S(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < tw; x += 16) { - WriteBlock16((BYTE*)&m_vm16[BlockAddress16S(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteBlock16((BYTE*)&m_vm16[BlockAddress16S(x, y, bp, bw)], src + (x - tx) * 2, srcpitch); } } } @@ -1120,6 +1134,9 @@ void GSLocalMemory::WriteImage8(int& tx, int& ty, BYTE* src, int len, GIFRegBITB { if(TRXREG.RRW == 0) return; + DWORD bp = BITBLTBUF.DBP; + DWORD bw = BITBLTBUF.DBW; + int tw = TRXREG.RRW, srcpitch = TRXREG.RRW - TRXPOS.DSAX; int th = len / srcpitch; @@ -1139,14 +1156,14 @@ void GSLocalMemory::WriteImage8(int& tx, int& ty, BYTE* src, int len, GIFRegBITB { for(int x = tx; x < twa; x += 16) { - WriteBlock8((BYTE*)&m_vm8[BlockAddress8(x, ty, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx), srcpitch); + WriteBlock8((BYTE*)&m_vm8[BlockAddress8(x, ty, bp, bw)], src + (x - tx), srcpitch); } for(int i = 0; i < 16; i++, ty++, src += srcpitch) { for(int x = twa; x < tw; x++) { - WritePixel8(x, ty, src[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel8(x, ty, src[x - tx], bp, bw); } } } @@ -1164,14 +1181,14 @@ void GSLocalMemory::WriteImage8(int& tx, int& ty, BYTE* src, int len, GIFRegBITB { for(int x = tx; x < twa; x += 16) { - WriteColumn8(ty, (BYTE*)&m_vm8[BlockAddress8(x, ty & ~15, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx), srcpitch); + WriteColumn8(ty, (BYTE*)&m_vm8[BlockAddress8(x, ty & ~15, bp, bw)], src + (x - tx), srcpitch); } for(int i = 0; i < 4; i++, ty++, src += srcpitch) { for(int x = twa; x < tw; x++) { - WritePixel8(x, ty, src[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel8(x, ty, src[x - tx], bp, bw); } } } @@ -1189,7 +1206,7 @@ void GSLocalMemory::WriteImage8(int& tx, int& ty, BYTE* src, int len, GIFRegBITB { for(int x = tx; x < tw; x += 16) { - WriteBlock8((BYTE*)&m_vm8[BlockAddress8(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx), srcpitch); + WriteBlock8((BYTE*)&m_vm8[BlockAddress8(x, y, bp, bw)], src + (x - tx), srcpitch); } } } @@ -1199,7 +1216,7 @@ void GSLocalMemory::WriteImage8(int& tx, int& ty, BYTE* src, int len, GIFRegBITB { for(int x = tx; x < tw; x += 16) { - WriteBlock8((BYTE*)&m_vm8[BlockAddress8(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx), srcpitch); + WriteBlock8((BYTE*)&m_vm8[BlockAddress8(x, y, bp, bw)], src + (x - tx), srcpitch); } } } @@ -1212,6 +1229,9 @@ void GSLocalMemory::WriteImage4(int& tx, int& ty, BYTE* src, int len, GIFRegBITB { if(TRXREG.RRW == 0) return; + DWORD bp = BITBLTBUF.DBP; + DWORD bw = BITBLTBUF.DBW; + int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX) / 2; int th = len / srcpitch; @@ -1231,7 +1251,7 @@ void GSLocalMemory::WriteImage4(int& tx, int& ty, BYTE* src, int len, GIFRegBITB { for(int x = tx; x < twa; x += 32) { - WriteBlock4((BYTE*)&m_vm8[BlockAddress4(x, ty, BITBLTBUF.DBP, BITBLTBUF.DBW) >> 1], src + (x - tx) / 2, srcpitch); + WriteBlock4((BYTE*)&m_vm8[BlockAddress4(x, ty, bp, bw) >> 1], src + (x - tx) / 2, srcpitch); } for(int i = 0; i < 16; i++, ty++, src += srcpitch) @@ -1240,8 +1260,8 @@ void GSLocalMemory::WriteImage4(int& tx, int& ty, BYTE* src, int len, GIFRegBITB for(int x = twa; x < tw; x += 2, s++) { - WritePixel4(x, ty, *s & 0xf, BITBLTBUF.DBP, BITBLTBUF.DBW), - WritePixel4(x + 1, ty, *s >> 4, BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel4(x, ty, *s & 0xf, bp, bw), + WritePixel4(x + 1, ty, *s >> 4, bp, bw); } } } @@ -1259,7 +1279,7 @@ void GSLocalMemory::WriteImage4(int& tx, int& ty, BYTE* src, int len, GIFRegBITB { for(int x = tx; x < twa; x += 32) { - WriteColumn4(ty, (BYTE*)&m_vm8[BlockAddress4(x, ty & ~15, BITBLTBUF.DBP, BITBLTBUF.DBW) >> 1], src + (x - tx) / 2, srcpitch); + WriteColumn4(ty, (BYTE*)&m_vm8[BlockAddress4(x, ty & ~15, bp, bw) >> 1], src + (x - tx) / 2, srcpitch); } for(int i = 0; i < 4; i++, ty++, src += srcpitch) @@ -1268,8 +1288,8 @@ void GSLocalMemory::WriteImage4(int& tx, int& ty, BYTE* src, int len, GIFRegBITB for(int x = twa; x < tw; x += 2, s++) { - WritePixel4(x, ty, *s & 0xf, BITBLTBUF.DBP, BITBLTBUF.DBW), - WritePixel4(x + 1, ty, *s >> 4, BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel4(x, ty, *s & 0xf, bp, bw), + WritePixel4(x + 1, ty, *s >> 4, bp, bw); } } } @@ -1287,7 +1307,7 @@ void GSLocalMemory::WriteImage4(int& tx, int& ty, BYTE* src, int len, GIFRegBITB { for(int x = tx; x < tw; x += 32) { - WriteBlock4((BYTE*)&m_vm8[BlockAddress4(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW) >> 1], src + (x - tx) / 2, srcpitch); + WriteBlock4((BYTE*)&m_vm8[BlockAddress4(x, y, bp, bw) >> 1], src + (x - tx) / 2, srcpitch); } } } @@ -1297,7 +1317,7 @@ void GSLocalMemory::WriteImage4(int& tx, int& ty, BYTE* src, int len, GIFRegBITB { for(int x = tx; x < tw; x += 32) { - WriteBlock4((BYTE*)&m_vm8[BlockAddress4(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW) >> 1], src + (x - tx) / 2, srcpitch); + WriteBlock4((BYTE*)&m_vm8[BlockAddress4(x, y, bp, bw) >> 1], src + (x - tx) / 2, srcpitch); } } } @@ -1310,6 +1330,9 @@ void GSLocalMemory::WriteImage8H(int& tx, int& ty, BYTE* src, int len, GIFRegBIT { if(TRXREG.RRW == 0) return; + DWORD bp = BITBLTBUF.DBP; + DWORD bw = BITBLTBUF.DBW; + int tw = TRXREG.RRW, srcpitch = TRXREG.RRW - TRXPOS.DSAX; int th = len / srcpitch; @@ -1329,7 +1352,7 @@ void GSLocalMemory::WriteImage8H(int& tx, int& ty, BYTE* src, int len, GIFRegBIT { for(int x = tx; x < tw; x += 8) { - UnpackAndWriteBlock8H(src + (x - tx), srcpitch, (BYTE*)&m_vm32[BlockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)]); + UnpackAndWriteBlock8H(src + (x - tx), srcpitch, (BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)]); } } @@ -1341,6 +1364,9 @@ void GSLocalMemory::WriteImage4HL(int& tx, int& ty, BYTE* src, int len, GIFRegBI { if(TRXREG.RRW == 0) return; + DWORD bp = BITBLTBUF.DBP; + DWORD bw = BITBLTBUF.DBW; + int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX) / 2; int th = len / srcpitch; @@ -1360,7 +1386,7 @@ void GSLocalMemory::WriteImage4HL(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < tw; x += 8) { - UnpackAndWriteBlock4HL(src + (x - tx) / 2, srcpitch, (BYTE*)&m_vm32[BlockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)]); + UnpackAndWriteBlock4HL(src + (x - tx) / 2, srcpitch, (BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)]); } } @@ -1372,6 +1398,9 @@ void GSLocalMemory::WriteImage4HH(int& tx, int& ty, BYTE* src, int len, GIFRegBI { if(TRXREG.RRW == 0) return; + DWORD bp = BITBLTBUF.DBP; + DWORD bw = BITBLTBUF.DBW; + int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX) / 2; int th = len / srcpitch; @@ -1391,7 +1420,7 @@ void GSLocalMemory::WriteImage4HH(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < tw; x += 8) { - UnpackAndWriteBlock4HH(src + (x - tx) / 2, srcpitch, (BYTE*)&m_vm32[BlockAddress32(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)]); + UnpackAndWriteBlock4HH(src + (x - tx) / 2, srcpitch, (BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)]); } } @@ -1403,6 +1432,9 @@ void GSLocalMemory::WriteImage32Z(int& tx, int& ty, BYTE* src, int len, GIFRegBI { if(TRXREG.RRW == 0) return; + DWORD bp = BITBLTBUF.DBP; + DWORD bw = BITBLTBUF.DBW; + int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX) * 4; int th = len / srcpitch; @@ -1422,14 +1454,14 @@ void GSLocalMemory::WriteImage32Z(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < twa; x += 8) { - WriteBlock32((BYTE*)&m_vm32[BlockAddress32Z(x, ty, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 4, srcpitch); + WriteBlock32((BYTE*)&m_vm32[BlockAddress32Z(x, ty, bp, bw)], src + (x - tx) * 4, srcpitch); } for(int i = 0; i < 8; i++, ty++, src += srcpitch) { for(int x = twa; x < tw; x++) { - WritePixel32Z(x, ty, ((DWORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel32Z(x, ty, ((DWORD*)src)[x - tx], bp, bw); } } } @@ -1447,14 +1479,14 @@ void GSLocalMemory::WriteImage32Z(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < twa; x += 8) { - WriteColumn32(ty, (BYTE*)&m_vm32[BlockAddress32Z(x, ty & ~7, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 4, srcpitch); + WriteColumn32(ty, (BYTE*)&m_vm32[BlockAddress32Z(x, ty & ~7, bp, bw)], src + (x - tx) * 4, srcpitch); } for(int i = 0; i < 2; i++, ty++, src += srcpitch) { for(int x = twa; x < tw; x++) { - WritePixel32Z(x, ty, ((DWORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel32Z(x, ty, ((DWORD*)src)[x - tx], bp, bw); } } } @@ -1472,7 +1504,7 @@ void GSLocalMemory::WriteImage32Z(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < tw; x += 8) { - WriteBlock32((BYTE*)&m_vm32[BlockAddress32Z(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 4, srcpitch); + WriteBlock32((BYTE*)&m_vm32[BlockAddress32Z(x, y, bp, bw)], src + (x - tx) * 4, srcpitch); } } } @@ -1482,7 +1514,7 @@ void GSLocalMemory::WriteImage32Z(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < tw; x += 8) { - WriteBlock32((BYTE*)&m_vm32[BlockAddress32Z(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 4, srcpitch); + WriteBlock32((BYTE*)&m_vm32[BlockAddress32Z(x, y, bp, bw)], src + (x - tx) * 4, srcpitch); } } } @@ -1495,6 +1527,9 @@ void GSLocalMemory::WriteImage24Z(int& tx, int& ty, BYTE* src, int len, GIFRegBI { if(TRXREG.RRW == 0) return; + DWORD bp = BITBLTBUF.DBP; + DWORD bw = BITBLTBUF.DBW; + int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX) * 3; int th = len / srcpitch; @@ -1514,7 +1549,7 @@ void GSLocalMemory::WriteImage24Z(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < tw; x += 8) { - UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, (BYTE*)&m_vm32[BlockAddress32Z(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)]); + UnpackAndWriteBlock24(src + (x - tx) * 3, srcpitch, (BYTE*)&m_vm32[BlockAddress32Z(x, y, bp, bw)]); } } @@ -1526,6 +1561,9 @@ void GSLocalMemory::WriteImage16Z(int& tx, int& ty, BYTE* src, int len, GIFRegBI { if(TRXREG.RRW == 0) return; + DWORD bp = BITBLTBUF.DBP; + DWORD bw = BITBLTBUF.DBW; + int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX) * 2; int th = len / srcpitch; @@ -1545,14 +1583,14 @@ void GSLocalMemory::WriteImage16Z(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < twa; x += 16) { - WriteBlock16((BYTE*)&m_vm16[BlockAddress16Z(x, ty, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteBlock16((BYTE*)&m_vm16[BlockAddress16Z(x, ty, bp, bw)], src + (x - tx) * 2, srcpitch); } for(int i = 0; i < 8; i++, ty++, src += srcpitch) { for(int x = twa; x < tw; x++) { - WritePixel16Z(x, ty, ((WORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel16Z(x, ty, ((WORD*)src)[x - tx], bp, bw); } } } @@ -1570,14 +1608,14 @@ void GSLocalMemory::WriteImage16Z(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < twa; x += 16) { - WriteColumn16(ty, (BYTE*)&m_vm16[BlockAddress16Z(x, ty & ~7, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteColumn16(ty, (BYTE*)&m_vm16[BlockAddress16Z(x, ty & ~7, bp, bw)], src + (x - tx) * 2, srcpitch); } for(int i = 0; i < 2; i++, ty++, src += srcpitch) { for(int x = twa; x < tw; x++) { - WritePixel16Z(x, ty, ((WORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel16Z(x, ty, ((WORD*)src)[x - tx], bp, bw); } } } @@ -1595,7 +1633,7 @@ void GSLocalMemory::WriteImage16Z(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < tw; x += 16) { - WriteBlock16((BYTE*)&m_vm16[BlockAddress16Z(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteBlock16((BYTE*)&m_vm16[BlockAddress16Z(x, y, bp, bw)], src + (x - tx) * 2, srcpitch); } } } @@ -1605,7 +1643,7 @@ void GSLocalMemory::WriteImage16Z(int& tx, int& ty, BYTE* src, int len, GIFRegBI { for(int x = tx; x < tw; x += 16) { - WriteBlock16((BYTE*)&m_vm16[BlockAddress16Z(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteBlock16((BYTE*)&m_vm16[BlockAddress16Z(x, y, bp, bw)], src + (x - tx) * 2, srcpitch); } } } @@ -1618,6 +1656,9 @@ void GSLocalMemory::WriteImage16SZ(int& tx, int& ty, BYTE* src, int len, GIFRegB { if(TRXREG.RRW == 0) return; + DWORD bp = BITBLTBUF.DBP; + DWORD bw = BITBLTBUF.DBW; + int tw = TRXREG.RRW, srcpitch = (TRXREG.RRW - TRXPOS.DSAX) * 2; int th = len / srcpitch; @@ -1637,14 +1678,14 @@ void GSLocalMemory::WriteImage16SZ(int& tx, int& ty, BYTE* src, int len, GIFRegB { for(int x = tx; x < twa; x += 16) { - WriteBlock16((BYTE*)&m_vm16[BlockAddress16SZ(x, ty, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteBlock16((BYTE*)&m_vm16[BlockAddress16SZ(x, ty, bp, bw)], src + (x - tx) * 2, srcpitch); } for(int i = 0; i < 8; i++, ty++, src += srcpitch) { for(int x = twa; x < tw; x++) { - WritePixel16SZ(x, ty, ((WORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel16SZ(x, ty, ((WORD*)src)[x - tx], bp, bw); } } } @@ -1662,14 +1703,14 @@ void GSLocalMemory::WriteImage16SZ(int& tx, int& ty, BYTE* src, int len, GIFRegB { for(int x = tx; x < twa; x += 16) { - WriteColumn16(ty, (BYTE*)&m_vm16[BlockAddress16SZ(x, ty & ~7, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteColumn16(ty, (BYTE*)&m_vm16[BlockAddress16SZ(x, ty & ~7, bp, bw)], src + (x - tx) * 2, srcpitch); } for(int i = 0; i < 2; i++, ty++, src += srcpitch) { for(int x = twa; x < tw; x++) { - WritePixel16SZ(x, ty, ((WORD*)src)[x - tx], BITBLTBUF.DBP, BITBLTBUF.DBW); + WritePixel16SZ(x, ty, ((WORD*)src)[x - tx], bp, bw); } } } @@ -1687,7 +1728,7 @@ void GSLocalMemory::WriteImage16SZ(int& tx, int& ty, BYTE* src, int len, GIFRegB { for(int x = tx; x < tw; x += 16) { - WriteBlock16((BYTE*)&m_vm16[BlockAddress16SZ(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteBlock16((BYTE*)&m_vm16[BlockAddress16SZ(x, y, bp, bw)], src + (x - tx) * 2, srcpitch); } } } @@ -1697,7 +1738,7 @@ void GSLocalMemory::WriteImage16SZ(int& tx, int& ty, BYTE* src, int len, GIFRegB { for(int x = tx; x < tw; x += 16) { - WriteBlock16((BYTE*)&m_vm16[BlockAddress16SZ(x, y, BITBLTBUF.DBP, BITBLTBUF.DBW)], src + (x - tx) * 2, srcpitch); + WriteBlock16((BYTE*)&m_vm16[BlockAddress16SZ(x, y, bp, bw)], src + (x - tx) * 2, srcpitch); } } } @@ -2065,28 +2106,29 @@ void GSLocalMemory::ReadImageX(int& tx, int& ty, BYTE* dst, int len, GIFRegBITBL void GSLocalMemory::ReadTexture32(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA) { - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 32) { - ReadBlock32((BYTE*)&m_vm32[BlockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 4, dstpitch); + ReadBlock32((BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)], dst, dstpitch); } FOREACH_BLOCK_END + } void GSLocalMemory::ReadTexture24(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA) { if(TEXA.AEM) { - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 32) { - ReadAndExpandBlock24((BYTE*)&m_vm32[BlockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 4, dstpitch, TEXA); + ReadAndExpandBlock24((BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)], dst, dstpitch, TEXA); } FOREACH_BLOCK_END } else { - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 32) { - ReadAndExpandBlock24((BYTE*)&m_vm32[BlockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 4, dstpitch, TEXA); + ReadAndExpandBlock24((BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)], dst, dstpitch, TEXA); } FOREACH_BLOCK_END } @@ -2096,11 +2138,11 @@ void GSLocalMemory::ReadTexture16(const CRect& r, BYTE* dst, int dstpitch, GIFRe { __declspec(align(16)) WORD block[16 * 8]; - FOREACH_BLOCK_START(r, 16, 8, 16) + FOREACH_BLOCK_START(16, 8, 32) { - ReadBlock16((BYTE*)&m_vm16[BlockAddress16(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block) / 8); + ReadBlock16((BYTE*)&m_vm16[BlockAddress16(x, y, bp, bw)], (BYTE*)block, sizeof(block) / 8); - ExpandBlock16(block, ptr + (x - r.left) * 4, dstpitch, TEXA); + ExpandBlock16(block, dst, dstpitch, TEXA); } FOREACH_BLOCK_END } @@ -2109,11 +2151,11 @@ void GSLocalMemory::ReadTexture16S(const CRect& r, BYTE* dst, int dstpitch, GIFR { __declspec(align(16)) WORD block[16 * 8]; - FOREACH_BLOCK_START(r, 16, 8, 16S) + FOREACH_BLOCK_START(16, 8, 32) { - ReadBlock16((BYTE*)&m_vm16[BlockAddress16S(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block) / 8); + ReadBlock16((BYTE*)&m_vm16[BlockAddress16S(x, y, bp, bw)], (BYTE*)block, sizeof(block) / 8); - ExpandBlock16(block, ptr + (x - r.left) * 4, dstpitch, TEXA); + ExpandBlock16(block, dst, dstpitch, TEXA); } FOREACH_BLOCK_END } @@ -2122,9 +2164,9 @@ void GSLocalMemory::ReadTexture8(const CRect& r, BYTE* dst, int dstpitch, GIFReg { DWORD* pal = m_clut32; - FOREACH_BLOCK_START(r, 16, 16, 8) + FOREACH_BLOCK_START(16, 16, 32) { - ReadAndExpandBlock8_32((BYTE*)&m_vm8[BlockAddress8(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 4, dstpitch, pal); + ReadAndExpandBlock8_32((BYTE*)&m_vm8[BlockAddress8(x, y, bp, bw)], dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -2133,9 +2175,9 @@ void GSLocalMemory::ReadTexture4(const CRect& r, BYTE* dst, int dstpitch, GIFReg { UINT64* pal = m_clut64; - FOREACH_BLOCK_START(r, 32, 16, 4) + FOREACH_BLOCK_START(32, 16, 32) { - ReadAndExpandBlock4_32((BYTE*)&m_vm8[BlockAddress4(x, y, TEX0.TBP0, TEX0.TBW) >> 1], ptr + (x - r.left) * 4, dstpitch, pal); + ReadAndExpandBlock4_32((BYTE*)&m_vm8[BlockAddress4(x, y, bp, bw) >> 1], dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -2144,9 +2186,9 @@ void GSLocalMemory::ReadTexture8H(const CRect& r, BYTE* dst, int dstpitch, GIFRe { DWORD* pal = m_clut32; - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 32) { - ReadAndExpandBlock8H_32((BYTE*)&m_vm32[BlockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 4, dstpitch, pal); + ReadAndExpandBlock8H_32((BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)], dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -2155,9 +2197,9 @@ void GSLocalMemory::ReadTexture4HL(const CRect& r, BYTE* dst, int dstpitch, GIFR { DWORD* pal = m_clut32; - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 32) { - ReadAndExpandBlock4HL_32((BYTE*)&m_vm32[BlockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 4, dstpitch, pal); + ReadAndExpandBlock4HL_32((BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)], dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -2166,18 +2208,18 @@ void GSLocalMemory::ReadTexture4HH(const CRect& r, BYTE* dst, int dstpitch, GIFR { DWORD* pal = m_clut32; - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 32) { - ReadAndExpandBlock4HH_32((BYTE*)&m_vm32[BlockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 4, dstpitch, pal); + ReadAndExpandBlock4HH_32((BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)], dst, dstpitch, pal); } FOREACH_BLOCK_END } void GSLocalMemory::ReadTexture32Z(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA) { - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 32) { - ReadBlock32((BYTE*)&m_vm32[BlockAddress32Z(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 4, dstpitch); + ReadBlock32((BYTE*)&m_vm32[BlockAddress32Z(x, y, bp, bw)], dst, dstpitch); } FOREACH_BLOCK_END } @@ -2186,17 +2228,17 @@ void GSLocalMemory::ReadTexture24Z(const CRect& r, BYTE* dst, int dstpitch, GIFR { if(TEXA.AEM) { - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 32) { - ReadAndExpandBlock24((BYTE*)&m_vm32[BlockAddress32Z(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 4, dstpitch, TEXA); + ReadAndExpandBlock24((BYTE*)&m_vm32[BlockAddress32Z(x, y, bp, bw)], dst, dstpitch, TEXA); } FOREACH_BLOCK_END } else { - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 32) { - ReadAndExpandBlock24((BYTE*)&m_vm32[BlockAddress32Z(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 4, dstpitch, TEXA); + ReadAndExpandBlock24((BYTE*)&m_vm32[BlockAddress32Z(x, y, bp, bw)], dst, dstpitch, TEXA); } FOREACH_BLOCK_END } @@ -2206,11 +2248,11 @@ void GSLocalMemory::ReadTexture16Z(const CRect& r, BYTE* dst, int dstpitch, GIFR { __declspec(align(16)) WORD block[16 * 8]; - FOREACH_BLOCK_START(r, 16, 8, 16) + FOREACH_BLOCK_START(16, 8, 32) { - ReadBlock16((BYTE*)&m_vm16[BlockAddress16Z(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block) / 8); + ReadBlock16((BYTE*)&m_vm16[BlockAddress16Z(x, y, bp, bw)], (BYTE*)block, sizeof(block) / 8); - ExpandBlock16(block, ptr + (x - r.left) * 4, dstpitch, TEXA); + ExpandBlock16(block, dst, dstpitch, TEXA); } FOREACH_BLOCK_END } @@ -2219,11 +2261,11 @@ void GSLocalMemory::ReadTexture16SZ(const CRect& r, BYTE* dst, int dstpitch, GIF { __declspec(align(16)) WORD block[16 * 8]; - FOREACH_BLOCK_START(r, 16, 8, 16S) + FOREACH_BLOCK_START(16, 8, 32) { - ReadBlock16((BYTE*)&m_vm16[BlockAddress16SZ(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block) / 8); + ReadBlock16((BYTE*)&m_vm16[BlockAddress16SZ(x, y, bp, bw)], (BYTE*)block, sizeof(block) / 8); - ExpandBlock16(block, ptr + (x - r.left) * 4, dstpitch, TEXA); + ExpandBlock16(block, dst, dstpitch, TEXA); } FOREACH_BLOCK_END } @@ -2270,18 +2312,18 @@ void GSLocalMemory::ReadTextureNC(const CRect& r, BYTE* dst, int dstpitch, GIFRe void GSLocalMemory::ReadTexture16NP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA) { - FOREACH_BLOCK_START(r, 16, 8, 16) + FOREACH_BLOCK_START(16, 8, 16) { - ReadBlock16((BYTE*)&m_vm16[BlockAddress16(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 2, dstpitch); + ReadBlock16((BYTE*)&m_vm16[BlockAddress16(x, y, bp, bw)], dst, dstpitch); } FOREACH_BLOCK_END } void GSLocalMemory::ReadTexture16SNP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA) { - FOREACH_BLOCK_START(r, 16, 8, 16S) + FOREACH_BLOCK_START(16, 8, 16) { - ReadBlock16((BYTE*)&m_vm16[BlockAddress16S(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 2, dstpitch); + ReadBlock16((BYTE*)&m_vm16[BlockAddress16S(x, y, bp, bw)], dst, dstpitch); } FOREACH_BLOCK_END } @@ -2292,9 +2334,9 @@ void GSLocalMemory::ReadTexture8NP(const CRect& r, BYTE* dst, int dstpitch, GIFR if(TEX0.CPSM == PSM_PSMCT32 || TEX0.CPSM == PSM_PSMCT24) { - FOREACH_BLOCK_START(r, 16, 16, 8) + FOREACH_BLOCK_START(16, 16, 32) { - ReadAndExpandBlock8_32((BYTE*)&m_vm8[BlockAddress8(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 4, dstpitch, pal); + ReadAndExpandBlock8_32((BYTE*)&m_vm8[BlockAddress8(x, y, bp, bw)], dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -2304,11 +2346,11 @@ void GSLocalMemory::ReadTexture8NP(const CRect& r, BYTE* dst, int dstpitch, GIFR __declspec(align(16)) BYTE block[16 * 16]; - FOREACH_BLOCK_START(r, 16, 16, 8) + FOREACH_BLOCK_START(16, 16, 16) { - ReadBlock8((BYTE*)&m_vm8[BlockAddress8(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block) / 16); + ReadBlock8((BYTE*)&m_vm8[BlockAddress8(x, y, bp, bw)], (BYTE*)block, sizeof(block) / 16); - ExpandBlock8_16(block, ptr + (x - r.left) * 2, dstpitch, pal); + ExpandBlock8_16(block, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -2320,9 +2362,9 @@ void GSLocalMemory::ReadTexture4NP(const CRect& r, BYTE* dst, int dstpitch, GIFR if(TEX0.CPSM == PSM_PSMCT32 || TEX0.CPSM == PSM_PSMCT24) { - FOREACH_BLOCK_START(r, 32, 16, 4) + FOREACH_BLOCK_START(32, 16, 32) { - ReadAndExpandBlock4_32((BYTE*)&m_vm8[BlockAddress4(x, y, TEX0.TBP0, TEX0.TBW) >> 1], ptr + (x - r.left) * 4, dstpitch, pal); + ReadAndExpandBlock4_32((BYTE*)&m_vm8[BlockAddress4(x, y, bp, bw) >> 1], dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -2332,11 +2374,11 @@ void GSLocalMemory::ReadTexture4NP(const CRect& r, BYTE* dst, int dstpitch, GIFR __declspec(align(16)) BYTE block[(32 / 2) * 16]; - FOREACH_BLOCK_START(r, 32, 16, 4) + FOREACH_BLOCK_START(32, 16, 16) { - ReadBlock4((BYTE*)&m_vm8[BlockAddress4(x, y, TEX0.TBP0, TEX0.TBW)>>1], (BYTE*)block, sizeof(block) / 16); + ReadBlock4((BYTE*)&m_vm8[BlockAddress4(x, y, bp, bw)>>1], (BYTE*)block, sizeof(block) / 16); - ExpandBlock4_16(block, ptr + (x - r.left) * 2, dstpitch, pal); + ExpandBlock4_16(block, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -2348,9 +2390,9 @@ void GSLocalMemory::ReadTexture8HNP(const CRect& r, BYTE* dst, int dstpitch, GIF if(TEX0.CPSM == PSM_PSMCT32 || TEX0.CPSM == PSM_PSMCT24) { - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 32) { - ReadAndExpandBlock8H_32((BYTE*)&m_vm32[BlockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 4, dstpitch, pal); + ReadAndExpandBlock8H_32((BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)], dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -2360,11 +2402,11 @@ void GSLocalMemory::ReadTexture8HNP(const CRect& r, BYTE* dst, int dstpitch, GIF __declspec(align(16)) DWORD block[8 * 8]; - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 16) { - ReadBlock32((BYTE*)&m_vm32[BlockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block) / 8); + ReadBlock32((BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)], (BYTE*)block, sizeof(block) / 8); - ExpandBlock8H_16(block, ptr + (x - r.left) * 2, dstpitch, pal); + ExpandBlock8H_16(block, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -2376,9 +2418,9 @@ void GSLocalMemory::ReadTexture4HLNP(const CRect& r, BYTE* dst, int dstpitch, GI if(TEX0.CPSM == PSM_PSMCT32 || TEX0.CPSM == PSM_PSMCT24) { - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 32) { - ReadAndExpandBlock4HL_32((BYTE*)&m_vm32[BlockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 4, dstpitch, pal); + ReadAndExpandBlock4HL_32((BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)], dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -2388,11 +2430,11 @@ void GSLocalMemory::ReadTexture4HLNP(const CRect& r, BYTE* dst, int dstpitch, GI __declspec(align(16)) DWORD block[8 * 8]; - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 16) { - ReadBlock32((BYTE*)&m_vm32[BlockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block) / 8); + ReadBlock32((BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)], (BYTE*)block, sizeof(block) / 8); - ExpandBlock4HL_16(block, ptr + (x - r.left) * 2, dstpitch, pal); + ExpandBlock4HL_16(block, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -2404,9 +2446,9 @@ void GSLocalMemory::ReadTexture4HHNP(const CRect& r, BYTE* dst, int dstpitch, GI if(TEX0.CPSM == PSM_PSMCT32 || TEX0.CPSM == PSM_PSMCT24) { - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 32) { - ReadAndExpandBlock4HH_32((BYTE*)&m_vm32[BlockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 4, dstpitch, pal); + ReadAndExpandBlock4HH_32((BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)], dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -2416,11 +2458,11 @@ void GSLocalMemory::ReadTexture4HHNP(const CRect& r, BYTE* dst, int dstpitch, GI __declspec(align(16)) DWORD block[8 * 8]; - FOREACH_BLOCK_START(r, 8, 8, 32) + FOREACH_BLOCK_START(8, 8, 16) { - ReadBlock32((BYTE*)&m_vm32[BlockAddress32(x, y, TEX0.TBP0, TEX0.TBW)], (BYTE*)block, sizeof(block) / 8); + ReadBlock32((BYTE*)&m_vm32[BlockAddress32(x, y, bp, bw)], (BYTE*)block, sizeof(block) / 8); - ExpandBlock4HH_16(block, ptr + (x - r.left) * 2, dstpitch, pal); + ExpandBlock4HH_16(block, dst, dstpitch, pal); } FOREACH_BLOCK_END } @@ -2428,18 +2470,18 @@ void GSLocalMemory::ReadTexture4HHNP(const CRect& r, BYTE* dst, int dstpitch, GI void GSLocalMemory::ReadTexture16ZNP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA) { - FOREACH_BLOCK_START(r, 16, 8, 16) + FOREACH_BLOCK_START(16, 8, 16) { - ReadBlock16((BYTE*)&m_vm16[BlockAddress16Z(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 2, dstpitch); + ReadBlock16((BYTE*)&m_vm16[BlockAddress16Z(x, y, bp, bw)], dst, dstpitch); } FOREACH_BLOCK_END } void GSLocalMemory::ReadTexture16SZNP(const CRect& r, BYTE* dst, int dstpitch, GIFRegTEX0& TEX0, GIFRegTEXA& TEXA) { - FOREACH_BLOCK_START(r, 16, 8, 16S) + FOREACH_BLOCK_START(16, 8, 16) { - ReadBlock16((BYTE*)&m_vm16[BlockAddress16SZ(x, y, TEX0.TBP0, TEX0.TBW)], ptr + (x - r.left) * 2, dstpitch); + ReadBlock16((BYTE*)&m_vm16[BlockAddress16SZ(x, y, bp, bw)], dst, dstpitch); } FOREACH_BLOCK_END } diff --git a/gsdx/GSLocalMemory.h b/gsdx/GSLocalMemory.h index 0463a14..fe5a92d 100644 --- a/gsdx/GSLocalMemory.h +++ b/gsdx/GSLocalMemory.h @@ -130,12 +130,14 @@ public: static DWORD PageAddress8(int x, int y, DWORD bp, DWORD bw) { - return ((bp >> 5) + (y >> 6) * ((bw + 1) >> 1) + (x >> 7)) << 13; + ASSERT((bw & 1) == 0); + return ((bp >> 5) + (y >> 6) * (bw >> 1) + (x >> 7)) << 13; } static DWORD PageAddress4(int x, int y, DWORD bp, DWORD bw) { - return ((bp >> 5) + (y >> 7) * ((bw + 1) >> 1) + (x >> 7)) << 14; + ASSERT((bw & 1) == 0); + return ((bp >> 5) + (y >> 7) * (bw >> 1) + (x >> 7)) << 14; } static DWORD BlockAddress32(int x, int y, DWORD bp, DWORD bw) @@ -161,14 +163,16 @@ public: static DWORD BlockAddress8(int x, int y, DWORD bp, DWORD bw) { - DWORD page = bp + ((y >> 1) & ~0x1f) * ((bw+1)>>1) + ((x >> 2) & ~0x1f); + ASSERT((bw & 1) == 0); + DWORD page = bp + ((y >> 1) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f); DWORD block = blockTable8[(y >> 4) & 3][(x >> 4) & 7]; return (page + block) << 8; } static DWORD BlockAddress4(int x, int y, DWORD bp, DWORD bw) { - DWORD page = bp + ((y >> 2) & ~0x1f) * ((bw+1)>>1) + ((x >> 2) & ~0x1f); + ASSERT((bw & 1) == 0); + DWORD page = bp + ((y >> 2) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f); DWORD block = blockTable4[(y >> 4) & 7][(x >> 5) & 3]; return (page + block) << 9; } @@ -223,16 +227,18 @@ public: static DWORD PixelAddressOrg8(int x, int y, DWORD bp, DWORD bw) { - DWORD page = bp + ((y >> 1) & ~0x1f) * ((bw + 1)>>1) + ((x >> 2) & ~0x1f); + ASSERT((bw & 1) == 0); + DWORD page = bp + ((y >> 1) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f); DWORD block = blockTable8[(y >> 4) & 3][(x >> 4) & 7]; DWORD word = ((page + block) << 8) + columnTable8[y & 15][x & 15]; - // ASSERT(word < 1024*1024*4); + ASSERT(word < 1024*1024*4); return word; } static DWORD PixelAddressOrg4(int x, int y, DWORD bp, DWORD bw) { - DWORD page = bp + ((y >> 2) & ~0x1f) * ((bw + 1)>>1) + ((x >> 2) & ~0x1f); + ASSERT((bw & 1) == 0); + DWORD page = bp + ((y >> 2) & ~0x1f) * (bw >> 1) + ((x >> 2) & ~0x1f); DWORD block = blockTable4[(y >> 4) & 7][(x >> 5) & 3]; DWORD word = ((page + block) << 9) + columnTable4[y & 15][x & 31]; ASSERT(word < 1024*1024*8); @@ -289,14 +295,16 @@ public: static __forceinline DWORD PixelAddress8(int x, int y, DWORD bp, DWORD bw) { - DWORD page = (bp >> 5) + (y >> 6) * ((bw + 1)>>1) + (x >> 7); + ASSERT((bw & 1) == 0); + DWORD page = (bp >> 5) + (y >> 6) * (bw >> 1) + (x >> 7); DWORD word = (page << 13) + pageOffset8[bp & 0x1f][y & 0x3f][x & 0x7f]; return word; } static __forceinline DWORD PixelAddress4(int x, int y, DWORD bp, DWORD bw) { - DWORD page = (bp >> 5) + (y >> 7) * ((bw + 1)>>1) + (x >> 7); + ASSERT((bw & 1) == 0); + DWORD page = (bp >> 5) + (y >> 7) * (bw >> 1) + (x >> 7); DWORD word = (page << 14) + pageOffset4[bp & 0x1f][y & 0x7f][x & 0x7f]; return word; } @@ -880,7 +888,7 @@ public: { case PSM_PSMCT32: case PSM_PSMZ32: - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 c = addr.gather32_32(m_vm32); #else c = GSVector4i( @@ -892,7 +900,7 @@ public: break; case PSM_PSMCT24: case PSM_PSMZ24: - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 c = addr.gather32_32(m_vm32); #else c = GSVector4i( @@ -907,7 +915,7 @@ public: case PSM_PSMCT16S: case PSM_PSMZ16: case PSM_PSMZ16S: - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 c = addr.gather32_32(m_vm16); #else c = GSVector4i( @@ -933,7 +941,7 @@ public: switch(PSM) { case PSM_PSMZ32: - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 z = addr.gather32_32(m_vm32); #else z = GSVector4i( @@ -944,7 +952,7 @@ public: #endif break; case PSM_PSMZ24: - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 z = addr.gather32_32(m_vm32) & 0x00ffffff; #else z = GSVector4i( @@ -957,7 +965,7 @@ public: break; case PSM_PSMZ16: case PSM_PSMZ16S: - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 z = addr.gather32_32(m_vm16); #else z = GSVector4i( diff --git a/gsdx/GSState.cpp b/gsdx/GSState.cpp index 4e72c51..fceaa57 100644 --- a/gsdx/GSState.cpp +++ b/gsdx/GSState.cpp @@ -608,6 +608,11 @@ template void GSState::GIFRegHandlerTEX0(GIFReg* r) FlushWrite(); m_mem.WriteCLUT(r->TEX0, m_env.TEXCLUT); + + if((m_env.CTXT[i].TEX0.TBW & 1) && (m_env.CTXT[i].TEX0.PSM == PSM_PSMT8 || m_env.CTXT[i].TEX0.PSM == PSM_PSMT4)) + { + m_env.CTXT[i].TEX0.TBW &= ~1; + } } template void GSState::GIFRegHandlerCLAMP(GIFReg* r) @@ -923,6 +928,16 @@ void GSState::GIFRegHandlerBITBLTBUF(GIFReg* r) } m_env.BITBLTBUF = r->BITBLTBUF; + + if((m_env.BITBLTBUF.SBW & 1) && (m_env.BITBLTBUF.SPSM == PSM_PSMT8 || m_env.BITBLTBUF.SPSM == PSM_PSMT4)) + { + m_env.BITBLTBUF.SBW &= ~1; + } + + if((m_env.BITBLTBUF.DBW & 1) && (m_env.BITBLTBUF.DPSM == PSM_PSMT8 || m_env.BITBLTBUF.DPSM == PSM_PSMT4)) + { + m_env.BITBLTBUF.DBW &= ~1; // namcoXcapcom: 5, 11, refered to as 4, 10 in TEX0.TBW later + } } void GSState::GIFRegHandlerTRXPOS(GIFReg* r) @@ -1059,24 +1074,14 @@ void GSState::FlushWrite(BYTE* mem, int len) void GSState::Write(BYTE* mem, int len) { -/**/ +/* TRACE(_T("Write len=%d DBP=%05x DBW=%d DPSM=%d DSAX=%d DSAY=%d RRW=%d RRH=%d\n"), len, (int)m_env.BITBLTBUF.DBP, (int)m_env.BITBLTBUF.DBW, (int)m_env.BITBLTBUF.DPSM, (int)m_env.TRXPOS.DSAX, (int)m_env.TRXPOS.DSAY, (int)m_env.TRXREG.RRW, (int)m_env.TRXREG.RRH); - +*/ if(len == 0) return; - if(m_game.title == CRC::NamcoXCapcom) - { - - if(m_env.BITBLTBUF.DBP == 0x03018 && m_env.BITBLTBUF.DBW == 11 && m_env.BITBLTBUF.DPSM == PSM_PSMT8 - || m_env.BITBLTBUF.DBP == 0x03b80 && m_env.BITBLTBUF.DBW == 5 && m_env.BITBLTBUF.DPSM == PSM_PSMT8) - { - m_env.BITBLTBUF.DBW--; // WTF - } - } - if(m_y >= m_env.TRXREG.RRH) return; // TODO: handle overflow during writing data too (just chop len below somewhere) // TODO: hmmmm diff --git a/gsdx/GSVector.h b/gsdx/GSVector.h index 398435f..19479e8 100644 --- a/gsdx/GSVector.h +++ b/gsdx/GSVector.h @@ -145,7 +145,7 @@ public: UINT32 rgba32() const { __m128i r = m; - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 r = _mm_packus_epi32(r, r); #else r = _mm_packs_epi32(r, r); // good enough for colors... @@ -157,7 +157,7 @@ public: UINT64 rgba64() const { __m128i r = m; - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 r = _mm_packus_epi32(r, r); #else r = _mm_packs_epi32(r, r); // good enough for colors... @@ -169,7 +169,7 @@ public: #endif } - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i sat_i8(const GSVector4i& a, const GSVector4i& b) const { return GSVector4i(_mm_min_epi8(_mm_max_epi8(m, a), b)); @@ -181,7 +181,7 @@ public: return GSVector4i(_mm_min_epi16(_mm_max_epi16(m, a), b)); } - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i sat_i32(const GSVector4i& a, const GSVector4i& b) const { return GSVector4i(_mm_min_epi32(_mm_max_epi32(m, a), b)); @@ -193,14 +193,14 @@ public: return GSVector4i(_mm_min_epu8(_mm_max_epu8(m, a), b)); } - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i sat_u16(const GSVector4i& a, const GSVector4i& b) const { return GSVector4i(_mm_min_epu16(_mm_max_epu16(m, a), b)); } #endif - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i sat_u32(const GSVector4i& a, const GSVector4i& b) const { return GSVector4i(_mm_min_epu32(_mm_max_epu32(m, a), b)); @@ -212,7 +212,7 @@ public: return GSVector4i(_mm_blendv_epi8(m, a, mask)); } - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 template GSVector4i blend16(const GSVector4i& a) const { return GSVector4i(_mm_blend_epi16(m, a, mask)); @@ -246,7 +246,7 @@ public: return GSVector4i(_mm_packs_epi32(m, a)); } - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 GSVector4i pu32(const GSVector4i& a) const { return GSVector4i(_mm_packus_epi32(m, a)); @@ -403,6 +403,58 @@ public: return GSVector4i(_mm_srli_epi64(m, i)); } + GSVector4i add8(const GSVector4i& v) const + { + return GSVector4i(_mm_add_epi8(m, v.m)); + } + + GSVector4i add16(const GSVector4i& v) const + { + return GSVector4i(_mm_add_epi16(m, v.m)); + } + + GSVector4i add32(const GSVector4i& v) const + { + return GSVector4i(_mm_add_epi32(m, v.m)); + } + + GSVector4i sub8(const GSVector4i& v) const + { + return GSVector4i(_mm_sub_epi8(m, v.m)); + } + + GSVector4i sub16(const GSVector4i& v) const + { + return GSVector4i(_mm_sub_epi16(m, v.m)); + } + + GSVector4i sub32(const GSVector4i& v) const + { + return GSVector4i(_mm_sub_epi32(m, v.m)); + } + + GSVector4i mul16hs(const GSVector4i& v) const + { + return GSVector4i(_mm_mulhi_epi16(m, v.m)); + } + + GSVector4i mul16hu(const GSVector4i& v) const + { + return GSVector4i(_mm_mulhi_epu16(m, v.m)); + } + + GSVector4i mul16l(const GSVector4i& v) const + { + return GSVector4i(_mm_mullo_epi16(m, v.m)); + } + + #if _M_SSE >= 0x301 + GSVector4i mul16hrs(const GSVector4i& v) const + { + return GSVector4i(_mm_mulhrs_epi16(m, v.m)); + } + #endif + GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(_mm_andnot_si128(v.m, m)); @@ -413,7 +465,17 @@ public: return _mm_movemask_epi8(m); } - #if _M_SSE >= 0x400 + template GSVector4i insert16(int a) const + { + return GSVector4i(_mm_insert_epi16(m, a, i)); + } + + template int extract16() const + { + return _mm_extract_epi16(m, i); + } + + #if _M_SSE >= 0x401 template GSVector4i insert8(int a) const { @@ -425,16 +487,6 @@ public: return _mm_extract_epi8(m, i); } - template GSVector4i insert16(int a) const - { - return GSVector4i(_mm_insert_epi16(m, a, i)); - } - - template int extract16() const - { - return _mm_extract_epi16(m, i); - } - template GSVector4i insert32(int a) const { return GSVector4i(_mm_insert_epi32(m, a, i)); @@ -862,7 +914,7 @@ public: return GSVector4i(0) == GSVector4i(0); } - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 static GSVector4i loadnt(const void* p) { return GSVector4i(_mm_stream_load_si128((__m128i*)p)); @@ -919,6 +971,11 @@ public: } #endif + static void storent(void* p, const GSVector4i& v) + { + _mm_stream_si128((__m128i*)p, v.m); + } + static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64((__m128i*)p, v.m); @@ -940,7 +997,7 @@ public: GSVector4i::storeh(ph, v); } - template static void store(const void* p, const GSVector4i& v) + template static void store(void* p, const GSVector4i& v) { if(aligned) _mm_store_si128((__m128i*)p, v.m); else _mm_storeu_si128((__m128i*)p, v.m); @@ -1186,6 +1243,11 @@ public: return (v1 < v2) | (v1 == v2); } + template GSVector4i shuffle() const + { + return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(i, i, i, i))); + } + #define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ @@ -1392,7 +1454,7 @@ public: #endif } - #if _M_SSE >= 0x400 + #if _M_SSE >= 0x401 template GSVector4 dp(const GSVector4& v) const { return GSVector4(_mm_dp_ps(m, v.m, i)); @@ -1434,6 +1496,16 @@ public: return GSVector4(_mm_unpackhi_ps(m, a)); } + GSVector4 l2h(const GSVector4& a) const + { + return GSVector4(_mm_movelh_ps(m, a)); + } + + GSVector4 h2l(const GSVector4& a) const + { + return GSVector4(_mm_movehl_ps(m, a)); + } + GSVector4 andnot(const GSVector4& v) const { return GSVector4(_mm_andnot_ps(v.m, m)); @@ -1461,8 +1533,26 @@ public: __forceinline static void transpose(GSVector4& a, GSVector4& b, GSVector4& c, GSVector4& d) { - _MM_TRANSPOSE4_PS(a.m, b.m, c.m, d.m); - } + GSVector4 v0 = a.xyxy(b); + GSVector4 v1 = c.xyxy(d); + GSVector4 v2 = a.zwzw(b); + GSVector4 v3 = c.zwzw(d); + + a = v0.xzxz(v1); + b = v0.ywyw(v1); + c = v2.xzxz(v3); + d = v2.ywyw(v3); +/* + GSVector4 v0 = a.upl(b); + GSVector4 v1 = a.uph(b); + GSVector4 v2 = c.upl(d); + GSVector4 v3 = c.uph(d); + + a = v0.l2h(v2); + b = v2.h2l(v0); + c = v1.l2h(v3); + d = v3.h2l(v1); +*/ } void operator += (const GSVector4& v) { @@ -1604,6 +1694,11 @@ public: return GSVector4(_mm_cmple_ps(v1, v2)); } + template GSVector4 shuffle() const + { + return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(i, i, i, i))); + } + #define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ GSVector4 xs##ys##zs##ws() const {return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ GSVector4 xs##ys##zs##ws(const GSVector4& v) const {return GSVector4(_mm_shuffle_ps(m, v.m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ diff --git a/gsdx/GSdx.cpp b/gsdx/GSdx.cpp index f36daec..e309439 100644 --- a/gsdx/GSdx.cpp +++ b/gsdx/GSdx.cpp @@ -105,6 +105,30 @@ BOOL GSdxApp::InitInstance() return TRUE; } +static bool CheckSSE() +{ + __try + { + static __m128i m; + + #if _M_SSE >= 0x402 + m.m128i_i32[0] = _mm_popcnt_u32(1234); + #elif _M_SSE >= 0x401 + m = _mm_packus_epi32(m, m); + #elif _M_SSE >= 0x301 + m = _mm_alignr_epi8(m, m, 1); + #elif _M_SSE >= 0x200 + m = _mm_packs_epi32(m, m); + #endif + } + __except(EXCEPTION_EXECUTE_HANDLER) + { + return false; + } + + return true; +} + // #define PS2E_LT_GS 0x01 @@ -137,8 +161,10 @@ EXPORT_C_(char*) PS2EgetLibName() sl.AddTail(s); #endif -#if _M_SSE >= 0x400 - sl.AddTail(_T("SSE4")); +#if _M_SSE >= 0x402 + sl.AddTail(_T("SSE42")); +#elif _M_SSE >= 0x401 + sl.AddTail(_T("SSE41")); #elif _M_SSE >= 0x301 sl.AddTail(_T("SSSE3")); #elif _M_SSE >= 0x200 @@ -221,6 +247,8 @@ static INT32 GSopen(void* dsp, char* title, int mt, int renderer) { AFX_MANAGE_STATE(AfxGetStaticModuleState()); + // + CString str; str.Format(_T("d3dx9_%d.dll"), D3DX_SDK_VERSION); @@ -241,6 +269,18 @@ static INT32 GSopen(void* dsp, char* title, int mt, int renderer) return -1; } + // + + if(!CheckSSE()) + { + CString str; + str.Format(_T("This CPU does not support SSE %d.%02d"), _M_SSE >> 8, _M_SSE & 0xff); + AfxMessageBox(str, MB_OK); + return -1; + } + + // + GSclose(); // TODO @@ -555,6 +595,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow // + //for(int tbw = 5; tbw <= 10; tbw++) for(int tbw = 5; tbw <= 10; tbw++) { int n = 256 << ((10 - tbw) * 2); diff --git a/gsdx/GSdx_vs2008.vcproj b/gsdx/GSdx_vs2008.vcproj index 118e928..798acc4 100644 --- a/gsdx/GSdx_vs2008.vcproj +++ b/gsdx/GSdx_vs2008.vcproj @@ -788,6 +788,7 @@ = 0x400 +#if _M_SSE >= 0x401 #include diff --git a/sse4.vsprops b/sse4.vsprops index 5bdfcc5..601b39f 100644 --- a/sse4.vsprops +++ b/sse4.vsprops @@ -6,7 +6,7 @@ > diff --git a/xpad/xpad_vs2008.vcproj b/xpad/xpad_vs2008.vcproj index 6f4f393..1089375 100644 --- a/xpad/xpad_vs2008.vcproj +++ b/xpad/xpad_vs2008.vcproj @@ -788,6 +788,7 @@