From f73c4d734d2527804e8b3dd940203e695fe0176b Mon Sep 17 00:00:00 2001 From: gabest Date: Fri, 5 Dec 2008 23:00:44 +0000 Subject: [PATCH] --- gsdx/GPUDrawScanline.cpp | 6 +- gsdx/GSDrawScanline.cpp | 644 +++++++++++++++++++++++++-------------- gsdx/GSDrawScanline.h | 20 +- gsdx/GSRendererSW.cpp | 2 +- gsdx/GSRendererSW.h | 9 +- gsdx/GSVector.h | 63 ++++ 6 files changed, 494 insertions(+), 250 deletions(-) diff --git a/gsdx/GPUDrawScanline.cpp b/gsdx/GPUDrawScanline.cpp index ae42f68..8f30031 100644 --- a/gsdx/GPUDrawScanline.cpp +++ b/gsdx/GPUDrawScanline.cpp @@ -340,9 +340,9 @@ void GPUDrawScanline::ColorTFX(DWORD tfx, const GSVector4i& r, const GSVector4i& c[2] = b.srl16(7); break; case 2: // modulate (tfx = tme | tge) - c[0] = c[0].sll16(2).mul16hu(r).pu16().upl8(); - c[1] = c[1].sll16(2).mul16hu(g).pu16().upl8(); - c[2] = c[2].sll16(2).mul16hu(b).pu16().upl8(); + c[0] = c[0].sll16(2).mul16hu(r).clamp8(); + c[1] = c[1].sll16(2).mul16hu(g).clamp8(); + c[2] = c[2].sll16(2).mul16hu(b).clamp8(); break; case 3: // decal (tfx = tme) break; diff --git a/gsdx/GSDrawScanline.cpp b/gsdx/GSDrawScanline.cpp index c9b4650..bd1025b 100644 --- a/gsdx/GSDrawScanline.cpp +++ b/gsdx/GSDrawScanline.cpp @@ -86,7 +86,7 @@ void GSDrawScanline::SetupDraw(Vertex* vertices, int count, const void* texture) { // if q is constant we can do the half pel shift for bilinear sampling on the vertices - GSVector4 half(0.5f, 0.5f, 0.0f, 0.0f); + GSVector4 half(2048.0f, 2048.0f, 0.0f, 0.0f); for(int i = 0; i < count; i++) { @@ -171,9 +171,10 @@ void GSDrawScanline::SetupDraw(Vertex* vertices, int count, const void* texture) m_slenv.colclamp = GSVector4i(env.COLCLAMP.CLAMP ? 0xffffffff : 0x00ff00ff); m_slenv.fba = GSVector4i(context->FBA.FBA ? 0x80000000 : 0); m_slenv.aref = GSVector4i((int)context->TEST.AREF + (m_sel.atst == ATST_LESS ? -1 : m_sel.atst == ATST_GREATER ? +1 : 0)); - m_slenv.afix = GSVector4((float)(int)context->ALPHA.FIX); - m_slenv.afix2 = m_slenv.afix * (2.0f / 256); - m_slenv.fc = GSVector4((DWORD)env.FOGCOL.ai32[0]); + m_slenv.afix = GSVector4i((int)context->ALPHA.FIX << 16); + m_slenv.afix2 = m_slenv.afix << 1; + m_slenv.frb = GSVector4i((int)env.FOGCOL.ai32[0] & 0x00ff00ff).xxxx(); + m_slenv.fga = GSVector4i((int)(env.FOGCOL.ai32[0] >> 8) & 0x00ff00ff).xxxx(); if(m_sel.fpsm == 1) { @@ -251,20 +252,39 @@ void GSDrawScanline::SetupDraw(Vertex* vertices, int count, const void* texture) void GSDrawScanline::SetupScanline(const Vertex& dv) { + GSVector4 ps0123 = GSVector4::ps0123(); + GSVector4 dp = dv.p; - - m_slenv.dp = dp; - m_slenv.dp4 = dp * 4.0; - GSVector4 dt = dv.t; + GSVector4 dc = dv.c; + + // p + + GSVector4 dz = dp.zzzz(); + + m_slenv.dz = dz * ps0123; + m_slenv.dz4 = dz * 4.0; + + GSVector4 df = dp.wwww(); + GSVector4i dfi = GSVector4i(df * ps0123).ps32(GSVector4i(df * 4.0f)); + + m_slenv.df = dfi.upl16(dfi); + m_slenv.df4 = dfi.uph16(dfi); + + // t m_slenv.dt = dt; m_slenv.dt4 = dt * 4.0; - GSVector4 dc = dv.c; + // c - m_slenv.dc = dc; - m_slenv.dc4 = dc * 4.0; + GSVector4i drg = GSVector4i(dc.xxxx() * ps0123).ps32(GSVector4i(dc.yyyy() * ps0123)); + GSVector4i dba = GSVector4i(dc.zzzz() * ps0123).ps32(GSVector4i(dc.wwww() * ps0123)); + GSVector4i dc4 = GSVector4i(dc * 4.0f).ps32().xzywl(); + + m_slenv.drb = drg.upl16(dba); + m_slenv.dga = drg.uph16(dba); + m_slenv.dc4 = dc4; } void GSDrawScanline::DrawScanline(int top, int left, int right, const Vertex& v) @@ -290,7 +310,7 @@ void GSDrawScanline::FillRect(const GSVector4i& r, const Vertex& v) m_state->m_mem.FillRect(r, (DWORD)(float)v.p.z, zpsm, zbp, bw); } - DWORD c = v.c.rgba32(); + DWORD c = (GSVector4i(v.c) >> 7).rgba32(); if(context->FBA.FBA) { @@ -380,24 +400,53 @@ GSVector4i GSDrawScanline::Wrap(const GSVector4i& t) return clamp.blend8(repeat, m_slenv.t.mask); } -void GSDrawScanline::SampleTexture(int pixels, DWORD ztst, DWORD ltf, DWORD tlu, const GSVector4i& test, const GSVector4& u, const GSVector4& v, GSVector4* c) +void GSDrawScanline::SampleTexture(int pixels, DWORD ztst, DWORD ltf, DWORD tlu, const GSVector4i& test, const GSVector4& u, const GSVector4& v, GSVector4i* c) { const void* RESTRICT tex = m_slenv.tex; const DWORD* RESTRICT clut = m_slenv.clut; const DWORD tw = m_slenv.tw; + GSVector4i ui = GSVector4i(u); + GSVector4i vi = GSVector4i(v); + + GSVector4i uv = (ui.sra32(12)).ps32(vi.sra32(12)); + + GSVector4i uv0, uv1; + GSVector4i addr00, addr01, addr10, addr11; + GSVector4i c00, c01, c10, c11; + if(ltf) { - GSVector4 uf = u.floor(); - GSVector4 vf = v.floor(); - - GSVector4 uff = u - uf; - GSVector4 vff = v - vf; + GSVector4i uvf = (ui & GSVector4i::x00000fff()).ps32(vi & GSVector4i::x00000fff()); + GSVector4i uf = uvf.upl16(uvf); + GSVector4i vf = uvf.uph16(uvf); - GSVector4i uv = GSVector4i(uf).ps32(GSVector4i(vf)); + uv0 = Wrap(uv); + uv1 = Wrap(uv.add16(GSVector4i::x0001())); - GSVector4i uv0 = Wrap(uv); - GSVector4i uv1 = Wrap(uv + GSVector4i::x0001(uv)); + addr00 = (uv0.uph16() << tw) + uv0.upl16(); + addr01 = (uv0.uph16() << tw) + uv1.upl16(); + addr10 = (uv1.uph16() << tw) + uv0.upl16(); + addr11 = (uv1.uph16() << tw) + uv1.upl16(); + + #if _M_SSE >= 0x401 + + if(tlu) + { + c00 = addr00.gather32_32((const BYTE*)tex).gather32_32(clut); + c01 = addr01.gather32_32((const BYTE*)tex).gather32_32(clut); + c10 = addr10.gather32_32((const BYTE*)tex).gather32_32(clut); + c11 = addr11.gather32_32((const BYTE*)tex).gather32_32(clut); + } + else + { + c00 = addr00.gather32_32((const DWORD*)tex); + c01 = addr01.gather32_32((const DWORD*)tex); + c10 = addr10.gather32_32((const DWORD*)tex); + c11 = addr11.gather32_32((const DWORD*)tex); + } + + #else int i = 0; @@ -410,16 +459,10 @@ void GSDrawScanline::SampleTexture(int pixels, DWORD ztst, DWORD ltf, DWORD tlu, continue; } - GSVector4 c00 = GSVector4(clut[((const BYTE*)tex)[(uv0.u16[i + 4] << tw) + uv0.u16[i]]]); - GSVector4 c01 = GSVector4(clut[((const BYTE*)tex)[(uv0.u16[i + 4] << tw) + uv1.u16[i]]]); - GSVector4 c10 = GSVector4(clut[((const BYTE*)tex)[(uv1.u16[i + 4] << tw) + uv0.u16[i]]]); - GSVector4 c11 = GSVector4(clut[((const BYTE*)tex)[(uv1.u16[i + 4] << tw) + uv1.u16[i]]]); - - c00 = c00.lerp(c01, uff.v[i]); - c10 = c10.lerp(c11, uff.v[i]); - c00 = c00.lerp(c10, vff.v[i]); - - c[i] = c00; + c00.u32[i] = clut[((const BYTE*)tex)[addr00.u32[i]]]; + c01.u32[i] = clut[((const BYTE*)tex)[addr01.u32[i]]]; + c10.u32[i] = clut[((const BYTE*)tex)[addr10.u32[i]]]; + c11.u32[i] = clut[((const BYTE*)tex)[addr11.u32[i]]]; } while(++i < pixels); } @@ -432,27 +475,55 @@ void GSDrawScanline::SampleTexture(int pixels, DWORD ztst, DWORD ltf, DWORD tlu, continue; } - GSVector4 c00 = GSVector4(((const DWORD*)tex)[(uv0.u16[i + 4] << tw) + uv0.u16[i]]); - GSVector4 c01 = GSVector4(((const DWORD*)tex)[(uv0.u16[i + 4] << tw) + uv1.u16[i]]); - GSVector4 c10 = GSVector4(((const DWORD*)tex)[(uv1.u16[i + 4] << tw) + uv0.u16[i]]); - GSVector4 c11 = GSVector4(((const DWORD*)tex)[(uv1.u16[i + 4] << tw) + uv1.u16[i]]); - - c00 = c00.lerp(c01, uff.v[i]); - c10 = c10.lerp(c11, uff.v[i]); - c00 = c00.lerp(c10, vff.v[i]); - - c[i] = c00; + c00.u32[i] = ((const DWORD*)tex)[addr00.u32[i]]; + c01.u32[i] = ((const DWORD*)tex)[addr01.u32[i]]; + c10.u32[i] = ((const DWORD*)tex)[addr10.u32[i]]; + c11.u32[i] = ((const DWORD*)tex)[addr11.u32[i]]; } while(++i < pixels); } - GSVector4::transpose(c[0], c[1], c[2], c[3]); + #endif + + GSVector4i rb00 = (c00 & GSVector4i::x00ff()); + GSVector4i rb01 = (c01 & GSVector4i::x00ff()); + GSVector4i rb10 = (c10 & GSVector4i::x00ff()); + GSVector4i rb11 = (c11 & GSVector4i::x00ff()); + + GSVector4i ga00 = (c00 & GSVector4i::xff00()) >> 8; + GSVector4i ga01 = (c01 & GSVector4i::xff00()) >> 8; + GSVector4i ga10 = (c10 & GSVector4i::xff00()) >> 8; + GSVector4i ga11 = (c11 & GSVector4i::xff00()) >> 8; + + rb00 = rb00.add16(rb01.sub16(rb00).sll16(4).mul16hs(uf)); + rb10 = rb10.add16(rb11.sub16(rb10).sll16(4).mul16hs(uf)); + rb00 = rb00.add16(rb10.sub16(rb00).sll16(4).mul16hs(vf)); + + ga00 = ga00.add16(ga01.sub16(ga00).sll16(4).mul16hs(uf)); + ga10 = ga10.add16(ga11.sub16(ga10).sll16(4).mul16hs(uf)); + ga00 = ga00.add16(ga10.sub16(ga00).sll16(4).mul16hs(vf)); + + c[0] = rb00; + c[1] = ga00; } else { - GSVector4i uv = Wrap(GSVector4i(u).ps32(GSVector4i(v))); + uv0 = Wrap(uv); - GSVector4i c00; + addr00 = (uv0.uph16() << tw) + uv0.upl16(); + + #if _M_SSE >= 0x401 + + if(tlu) + { + c00 = addr00.gather32_32((const BYTE*)tex).gather32_32(clut); + } + else + { + c00 = addr00.gather32_32((const DWORD*)tex); + } + + #else int i = 0; @@ -465,7 +536,7 @@ void GSDrawScanline::SampleTexture(int pixels, DWORD ztst, DWORD ltf, DWORD tlu, continue; } - c00.u32[i] = clut[((const BYTE*)tex)[(uv.u16[i + 4] << tw) + uv.u16[i]]]; + c00.u32[i] = clut[((const BYTE*)tex)[addr00.u32[i]]]; } while(++i < pixels); } @@ -478,73 +549,128 @@ void GSDrawScanline::SampleTexture(int pixels, DWORD ztst, DWORD ltf, DWORD tlu, continue; } - c00.u32[i] = ((const DWORD*)tex)[(uv.u16[i + 4] << tw) + uv.u16[i]]; + c00.u32[i] = ((const DWORD*)tex)[addr00.u32[i]]; } while(++i < pixels); } - // GSVector4::expand(c00, c[0], c[1], c[2], c[3]); + #endif - c[0] = (c00 << 24) >> 24; - c[1] = (c00 << 16) >> 24; - c[2] = (c00 << 8) >> 24; - c[3] = (c00 >> 24); + c[0] = c00 & 0x00ff00ff; + c[1] = (c00 >> 8) & 0x00ff00ff; } } -void GSDrawScanline::ColorTFX(DWORD tfx, const GSVector4& rf, const GSVector4& gf, const GSVector4& bf, const GSVector4& af, GSVector4& rt, GSVector4& gt, GSVector4& bt) +void GSDrawScanline::ColorTFX(DWORD tfx, const GSVector4i& rbf, const GSVector4i& gaf, GSVector4i& rbt, GSVector4i& gat) { + GSVector4i af; + switch(tfx) { case TFX_MODULATE: - rt = rt.mod2x(rf); - gt = gt.mod2x(gf); - bt = bt.mod2x(bf); - rt = rt.clamp(); - gt = gt.clamp(); - bt = bt.clamp(); + rbt = rbt.sll16(2).mul16hu(rbf).clamp8(); break; case TFX_DECAL: break; case TFX_HIGHLIGHT: case TFX_HIGHLIGHT2: - rt = rt.mod2x(rf) + af; - gt = gt.mod2x(gf) + af; - bt = bt.mod2x(bf) + af; - rt = rt.clamp(); - gt = gt.clamp(); - bt = bt.clamp(); + af = gaf.srl16(7).yywwl().yywwh(); + rbt = rbt.sll16(2).mul16hu(rbf).add16(af).clamp8(); + gat = gat.mix16(gat.sll16(2).mul16hu(gaf).add16(af).clamp8()); break; case TFX_NONE: - rt = rf; - gt = gf; - bt = bf; + rbt = rbf.srl16(7); break; default: __assume(0); } +/* + GSVector4i rb, ga, c, a, aa; + + switch(tfx) + { + case TFX_MODULATE: + rb = rbt.sll16(2).mul16hu(rbf); + ga = gat.sll16(2).mul16hu(gaf); + c = rb.pu16(ga); + rb = c.upl8(); + ga = c.uph8(); + if(!tcc) ga = gaf.srl16(7).mix16(ga); + break; + case TFX_DECAL: + return; + case TFX_HIGHLIGHT: + rb = rbt.sll16(2).mul16hu(rbf); + ga = gat.sll16(2).mul16hu(gaf); + c = rb.pu16(ga); + rb = c.upl8(); + ga = c.uph8(); + a = gaf.srl16(7); + aa = a.yywwl().yywwh(); + rb = rb.addus8(aa); + ga = ga.addus8(aa); + if(tcc) a = a.addus8(gat); + ga = a.mix16(ga); + break; + case TFX_HIGHLIGHT2: + rb = rbt.sll16(2).mul16hu(rbf); + ga = gat.sll16(2).mul16hu(gaf); + c = rb.pu16(ga); + rb = c.upl8(); + ga = c.uph8(); + a = gaf.srl16(7); + aa = a.yywwl().yywwh(); + rb = rb.addus8(aa); + ga = ga.addus8(aa); + if(!tcc) ga = a.mix16(ga); + break; + case TFX_NONE: + rb = rbf.srl16(7); + ga = gaf.srl16(7); + break; + default: + __assume(0); + } + + rbt = rb; + gat = ga; +*/ } -void GSDrawScanline::AlphaTFX(DWORD tfx, DWORD tcc, const GSVector4& af, GSVector4& at) +void GSDrawScanline::AlphaTFX(DWORD tfx, DWORD tcc, const GSVector4i& gaf, GSVector4i& gat) { switch(tfx) { - case TFX_MODULATE: at = tcc ? at.mod2x(af).clamp() : af; break; - case TFX_DECAL: break; - case TFX_HIGHLIGHT: at = tcc ? (at + af).clamp() : af; break; - case TFX_HIGHLIGHT2: if(!tcc) at = af; break; - case TFX_NONE: at = af; break; - default: __assume(0); + case TFX_MODULATE: + gat = gat.sll16(2).mul16hu(gaf).clamp8(); + if(!tcc) gat = gaf.srl16(7).mix16(gat); + break; + case TFX_DECAL: + break; + case TFX_HIGHLIGHT: + if(!tcc) gat = gaf.srl16(7); + else gat = gat.addus8(gaf.srl16(7)); + break; + case TFX_HIGHLIGHT2: + if(!tcc) gat = gaf.srl16(7); + break; + case TFX_NONE: + gat = gaf.srl16(7); + break; + default: + __assume(0); } } -void GSDrawScanline::Fog(const GSVector4& f, GSVector4& r, GSVector4& g, GSVector4& b) +void GSDrawScanline::Fog(const GSVector4i& f, GSVector4i& rb, GSVector4i& ga) { - GSVector4 fc = m_slenv.fc; + GSVector4i frb = m_slenv.frb; + GSVector4i fga = m_slenv.fga; - r = fc.xxxx().lerp(r, f); - g = fc.yyyy().lerp(g, f); - b = fc.zzzz().lerp(b, f); + GSVector4i fog = f.srl16(3); + + rb = frb.add16(rb.sub16(frb).sll16(4).mul16hs(fog)); + ga = ga.mix16(fga.add16(ga.sub16(fga).sll16(4).mul16hs(fog))); } bool GSDrawScanline::TestZ(DWORD zpsm, DWORD ztst, const GSVector4i& zs, const GSVector4i& za, GSVector4i& test) @@ -584,7 +710,7 @@ bool GSDrawScanline::TestZ(DWORD zpsm, DWORD ztst, const GSVector4i& zs, const G return true; } -bool GSDrawScanline::TestAlpha(DWORD atst, DWORD afail, const GSVector4& a, GSVector4i& fm, GSVector4i& zm, GSVector4i& test) +bool GSDrawScanline::TestAlpha(DWORD atst, DWORD afail, const GSVector4i& ga, GSVector4i& fm, GSVector4i& zm, GSVector4i& test) { if(atst != 1) { @@ -595,11 +721,11 @@ bool GSDrawScanline::TestAlpha(DWORD atst, DWORD afail, const GSVector4& a, GSVe case ATST_NEVER: t = GSVector4i::invzero(); break; case ATST_ALWAYS: t = GSVector4i::zero(); break; case ATST_LESS: - case ATST_LEQUAL: t = GSVector4i(a) > m_slenv.aref; break; - case ATST_EQUAL: t = GSVector4i(a) != m_slenv.aref; break; + case ATST_LEQUAL: t = (ga >> 16) > m_slenv.aref; break; + case ATST_EQUAL: t = (ga >> 16) != m_slenv.aref; break; case ATST_GEQUAL: - case ATST_GREATER: t = GSVector4i(a) < m_slenv.aref; break; - case ATST_NOTEQUAL: t = GSVector4i(a) == m_slenv.aref; break; + case ATST_GREATER: t = (ga >> 16) < m_slenv.aref; break; + case ATST_NOTEQUAL: t = (ga >> 16) == m_slenv.aref; break; default: __assume(0); } @@ -1810,6 +1936,11 @@ void GSDrawScanline::Init() m_dsmap.SetAt(0xe440c5e8, (DrawScanlinePtr)&GSDrawScanline::DrawScanlineExT<0xe440c5e8>); m_dsmap.SetAt(0xe440cc18, (DrawScanlinePtr)&GSDrawScanline::DrawScanlineExT<0xe440cc18>); m_dsmap.SetAt(0xe440d5f8, (DrawScanlinePtr)&GSDrawScanline::DrawScanlineExT<0xe440d5f8>); + m_dsmap.SetAt(0x0ff0c238, (DrawScanlinePtr)&GSDrawScanline::DrawScanlineExT<0x0ff0c238>); + m_dsmap.SetAt(0x6440c218, (DrawScanlinePtr)&GSDrawScanline::DrawScanlineExT<0x6440c218>); + m_dsmap.SetAt(0x6640c218, (DrawScanlinePtr)&GSDrawScanline::DrawScanlineExT<0x6640c218>); + m_dsmap.SetAt(0x8ff0d438, (DrawScanlinePtr)&GSDrawScanline::DrawScanlineExT<0x8ff0d438>); + m_dsmap.SetAt(0xe4403c18, (DrawScanlinePtr)&GSDrawScanline::DrawScanlineExT<0xe4403c18>); // tokyo bus guide @@ -2176,31 +2307,59 @@ void GSDrawScanline::Init() template void GSDrawScanline::DrawScanlineT(int top, int left, int right, const Vertex& v) { - GSVector4i fa_base = m_slenv.fbr[top]; - GSVector4i* fa_offset = (GSVector4i*)&m_slenv.fbc[left & 3][left]; + const GSVector4 ps0123 = GSVector4::ps0123(); - GSVector4i za_base = m_slenv.zbr[top]; - GSVector4i* za_offset = (GSVector4i*)&m_slenv.zbc[left & 3][left]; + GSVector4i fa_base; + GSVector4i* fa_offset; + + GSVector4i za_base; + GSVector4i* za_offset; - GSVector4 ps0123 = GSVector4::ps0123(); + GSVector4 z, s, t, q; + GSVector4i f, rb, ga; + + // fa + + fa_base = m_slenv.fbr[top]; + fa_offset = (GSVector4i*)&m_slenv.fbc[left & 3][left]; + + // za + + za_base = m_slenv.zbr[top]; + za_offset = (GSVector4i*)&m_slenv.zbc[left & 3][left]; + + // v.p GSVector4 vp = v.p; - GSVector4 dp = m_slenv.dp; - GSVector4 z = vp.zzzz(); z += dp.zzzz() * ps0123; - GSVector4 f = vp.wwww(); f += dp.wwww() * ps0123; + + z = vp.zzzz() + m_slenv.dz; + f = GSVector4i(vp).zzzzh().zzzz().add16(m_slenv.df); + + // v.t GSVector4 vt = v.t; GSVector4 dt = m_slenv.dt; - GSVector4 s = vt.xxxx(); s += dt.xxxx() * ps0123; - GSVector4 t = vt.yyyy(); t += dt.yyyy() * ps0123; - GSVector4 q = vt.zzzz(); q += dt.zzzz() * ps0123; + + s = vt.xxxx() + dt.xxxx() * ps0123; + t = vt.yyyy() + dt.yyyy() * ps0123; + q = vt.zzzz() + dt.zzzz() * ps0123; + + // v.c - GSVector4 vc = v.c; - GSVector4 dc = m_slenv.dc; - GSVector4 r = vc.xxxx(); if(iip) r += dc.xxxx() * ps0123; - GSVector4 g = vc.yyyy(); if(iip) g += dc.yyyy() * ps0123; - GSVector4 b = vc.zzzz(); if(iip) b += dc.zzzz() * ps0123; - GSVector4 a = vc.wwww(); if(iip) a += dc.wwww() * ps0123; + GSVector4i vc = GSVector4i(v.c); + + vc = vc.upl16(vc.zwxy()); + + rb = vc.xxxx(); + ga = vc.zzzz(); + + if(iip) + { + rb = rb.add16(m_slenv.drb); + ga = ga.add16(m_slenv.dga); + } + + // int steps = right - left; @@ -2221,7 +2380,7 @@ void GSDrawScanline::DrawScanlineT(int top, int left, int right, const Vertex& v int pixels = GSVector4i::min_i16(steps, 4); - GSVector4 c[12]; + GSVector4i c[6]; if(m_sel.tfx != TFX_NONE) { @@ -2237,29 +2396,29 @@ void GSDrawScanline::DrawScanlineT(int top, int left, int right, const Vertex& v if(m_sel.ltf) { - u -= 0.5f; - v -= 0.5f; + u -= 2048.0f; + v -= 2048.5f; } } SampleTexture(pixels, ztst, m_sel.ltf, m_sel.tlu, test, u, v, c); } - AlphaTFX(m_sel.tfx, m_sel.tcc, a, c[3]); + AlphaTFX(m_sel.tfx, m_sel.tcc, ga, c[1]); GSVector4i fm = m_slenv.fm; GSVector4i zm = m_slenv.zm; - if(!TestAlpha(m_sel.atst, m_sel.afail, c[3], fm, zm, test)) + if(!TestAlpha(m_sel.atst, m_sel.afail, c[1], fm, zm, test)) { continue; } - ColorTFX(m_sel.tfx, r, g, b, a, c[0], c[1], c[2]); + ColorTFX(m_sel.tfx, rb, ga, c[0], c[1]); if(m_sel.fge) { - Fog(f, c[0], c[1], c[2]); + Fog(f, c[0], c[1]); } GSVector4i fa = fa_base + GSVector4i::load(fa_offset); @@ -2286,55 +2445,46 @@ void GSDrawScanline::DrawScanlineT(int top, int left, int right, const Vertex& v if(m_sel.abe != 255) { - // GSVector4::expand(d, c[4], c[5], c[6], c[7]); - - c[4] = (d << 24) >> 24; - c[5] = (d << 16) >> 24; - c[6] = (d << 8) >> 24; - c[7] = (d >> 24); + c[2] = d & 0x00ff00ff; + c[3] = (d >> 8) & 0x00ff00ff; if(fpsm == 1) { - c[7] = GSVector4(128.0f); + c[3] = GSVector4i(0x00800000).mix16(c[3]); } - c[8] = GSVector4::zero(); - c[9] = GSVector4::zero(); - c[10] = GSVector4::zero(); - c[11] = m_slenv.afix; + c[4] = GSVector4::zero(); + c[5] = m_slenv.afix; DWORD abea = m_sel.abea; DWORD abeb = m_sel.abeb; DWORD abec = m_sel.abec; DWORD abed = m_sel.abed; - GSVector4 r = (c[abea*4 + 0] - c[abeb*4 + 0]).mod2x(c[abec*4 + 3]) + c[abed*4 + 0]; - GSVector4 g = (c[abea*4 + 1] - c[abeb*4 + 1]).mod2x(c[abec*4 + 3]) + c[abed*4 + 1]; - GSVector4 b = (c[abea*4 + 2] - c[abeb*4 + 2]).mod2x(c[abec*4 + 3]) + c[abed*4 + 2]; + GSVector4i a = c[abec * 2 + 1].yywwl().yywwh().sll16(5); + + GSVector4i drb = c[abea * 2 + 0].sub16(c[abeb * 2 + 0]); + GSVector4i dga = c[abea * 2 + 1].sub16(c[abeb * 2 + 1]); + + GSVector4i rb = drb.sll16(4).mul16hs(a).add16(c[abed * 2 + 0]); + GSVector4i ga = dga.sll16(4).mul16hs(a).add16(c[abed * 2 + 1]); if(m_sel.pabe) { - GSVector4 mask = c[3] >= GSVector4(128.0f); + GSVector4i mask = (c[1] << 8).sra32(31); - c[0] = c[0].blend8(r, mask); - c[1] = c[1].blend8(g, mask); - c[2] = c[2].blend8(b, mask); - } - else - { - c[0] = r; - c[1] = g; - c[2] = b; + rb = c[0].blend8(rb, mask); + ga = c[1].blend8(ga, mask); } + + c[0] = rb; + c[1] = c[1].mix16(ga); } - GSVector4i rb = GSVector4i(c[0]).ps32(GSVector4i(c[2])); - GSVector4i ga = GSVector4i(c[1]).ps32(GSVector4i(c[3])); - - GSVector4i rg = rb.upl16(ga) & m_slenv.colclamp; - GSVector4i ba = rb.uph16(ga) & m_slenv.colclamp; - - GSVector4i s = rg.upl32(ba).pu16(rg.uph32(ba)); + c[0] &= m_slenv.colclamp; + c[1] &= m_slenv.colclamp; + + GSVector4i s = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); if(fpsm != 1) { @@ -2357,10 +2507,8 @@ void GSDrawScanline::DrawScanlineT(int top, int left, int right, const Vertex& v fa_offset++; za_offset++; - GSVector4 dp4 = m_slenv.dp4; - - z += dp4.zzzz(); - f += dp4.wwww(); + z += m_slenv.dz4; + f = f.add16(m_slenv.df4); GSVector4 dt4 = m_slenv.dt4; @@ -2368,12 +2516,13 @@ void GSDrawScanline::DrawScanlineT(int top, int left, int right, const Vertex& v t += dt4.yyyy(); q += dt4.zzzz(); - GSVector4 dc4 = m_slenv.dc4; + if(iip) + { + GSVector4i dc4 = m_slenv.dc4; - if(iip) r += dc4.xxxx(); - if(iip) g += dc4.yyyy(); - if(iip) b += dc4.zzzz(); - if(iip) a += dc4.wwww(); + rb = rb.add16(dc4.xxxx()); + ga = ga.add16(dc4.yyyy()); + } } } @@ -2402,31 +2551,66 @@ void GSDrawScanline::DrawScanlineExT(int top, int left, int right, const Vertex& const DWORD wzb = (sel >> 30) & 1; const DWORD tlu = (sel >> 31) & 1; - GSVector4i fa_base = m_slenv.fbr[top]; - GSVector4i* fa_offset = (GSVector4i*)&m_slenv.fbc[left & 3][left]; + const GSVector4 ps0123 = GSVector4::ps0123(); - GSVector4i za_base = m_slenv.zbr[top]; - GSVector4i* za_offset = (GSVector4i*)&m_slenv.zbc[left & 3][left]; + GSVector4i fa_base; + GSVector4i* fa_offset; + + GSVector4i za_base; + GSVector4i* za_offset; - GSVector4 ps0123 = GSVector4::ps0123(); + GSVector4 z, s, t, q; + GSVector4i f, rb, ga; + + // fa + + fa_base = m_slenv.fbr[top]; + fa_offset = (GSVector4i*)&m_slenv.fbc[left & 3][left]; + + // za + + za_base = m_slenv.zbr[top]; + za_offset = (GSVector4i*)&m_slenv.zbc[left & 3][left]; + + // v.p GSVector4 vp = v.p; - GSVector4 dp = m_slenv.dp; - GSVector4 z = vp.zzzz(); z += dp.zzzz() * ps0123; - GSVector4 f = vp.wwww(); f += dp.wwww() * ps0123; + + z = vp.zzzz() + m_slenv.dz; + + if(fge) + { + f = GSVector4i(vp).zzzzh().zzzz().add16(m_slenv.df); + } + + // v.t GSVector4 vt = v.t; GSVector4 dt = m_slenv.dt; - GSVector4 s = vt.xxxx(); s += dt.xxxx() * ps0123; - GSVector4 t = vt.yyyy(); t += dt.yyyy() * ps0123; - GSVector4 q = vt.zzzz(); q += dt.zzzz() * ps0123; - GSVector4 vc = v.c; - GSVector4 dc = m_slenv.dc; - GSVector4 r = vc.xxxx(); if(iip) r += dc.xxxx() * ps0123; - GSVector4 g = vc.yyyy(); if(iip) g += dc.yyyy() * ps0123; - GSVector4 b = vc.zzzz(); if(iip) b += dc.zzzz() * ps0123; - GSVector4 a = vc.wwww(); if(iip) a += dc.wwww() * ps0123; + if(tfx != TFX_NONE) + { + s = vt.xxxx() + dt.xxxx() * ps0123; + t = vt.yyyy() + dt.yyyy() * ps0123; + q = vt.zzzz() + dt.zzzz() * ps0123; + } + + // v.c + + GSVector4i vc = GSVector4i(v.c); + + vc = vc.upl16(vc.zwxy()); + + rb = vc.xxxx(); + ga = vc.zzzz(); + + if(iip) + { + rb = rb.add16(m_slenv.drb); + ga = ga.add16(m_slenv.dga); + } + + // int steps = right - left; @@ -2447,7 +2631,7 @@ void GSDrawScanline::DrawScanlineExT(int top, int left, int right, const Vertex& int pixels = GSVector4i::min_i16(steps, 4); - GSVector4 c[12]; + GSVector4i c[6]; if(tfx != TFX_NONE) { @@ -2463,29 +2647,29 @@ void GSDrawScanline::DrawScanlineExT(int top, int left, int right, const Vertex& if(ltf) { - u -= 0.5f; - v -= 0.5f; + u -= 2048.0f; + v -= 2048.0f; } } SampleTexture(pixels, ztst, ltf, tlu, test, u, v, c); } - AlphaTFX(tfx, tcc, a, c[3]); + AlphaTFX(tfx, tcc, ga, c[1]); GSVector4i fm = m_slenv.fm; GSVector4i zm = m_slenv.zm; - if(!TestAlpha(atst, afail, c[3], fm, zm, test)) + if(!TestAlpha(atst, afail, c[1], fm, zm, test)) { continue; } - ColorTFX(tfx, r, g, b, a, c[0], c[1], c[2]); + ColorTFX(tfx, rb, ga, c[0], c[1]); if(fge) { - Fog(f, c[0], c[1], c[2]); + Fog(f, c[0], c[1]); } GSVector4i fa = fa_base + GSVector4i::load(fa_offset); @@ -2512,46 +2696,45 @@ void GSDrawScanline::DrawScanlineExT(int top, int left, int right, const Vertex& if(abe != 255) { - // GSVector4::expand(d, c[4], c[5], c[6], c[7]); - - c[4] = (d << 24) >> 24; - c[5] = (d << 16) >> 24; - c[6] = (d << 8) >> 24; - c[7] = (d >> 24); + c[2] = d & 0x00ff00ff; + c[3] = (d >> 8) & 0x00ff00ff; if(fpsm == 1) { - c[7] = GSVector4(128.0f); + c[3] = GSVector4i(0x00800000).mix16(c[3]); } - c[8] = GSVector4::zero(); - c[9] = GSVector4::zero(); - c[10] = GSVector4::zero(); - c[11] = m_slenv.afix; + c[4] = GSVector4::zero(); + c[5] = m_slenv.afix; - /* - GSVector4 r = (c[abea*4 + 0] - c[abeb*4 + 0]).mod2x(c[abec*4 + 3]) + c[abed*4 + 0]; - GSVector4 g = (c[abea*4 + 1] - c[abeb*4 + 1]).mod2x(c[abec*4 + 3]) + c[abed*4 + 1]; - GSVector4 b = (c[abea*4 + 2] - c[abeb*4 + 2]).mod2x(c[abec*4 + 3]) + c[abed*4 + 2]; - */ + GSVector4i a = c[abec * 2 + 1].yywwl().yywwh().sll16(5); +/* + GSVector4i drb = c[abea * 2 + 0].sub16(c[abeb * 2 + 0]).sll16(4); + GSVector4i dga = c[abea * 2 + 1].sub16(c[abeb * 2 + 1]).sll16(4); - GSVector4 r, g, b; + GSVector4i rb = drb.mul16hs(a).add16(c[abed * 2 + 0]); + GSVector4i ga = dga.mul16hs(a).add16(c[abed * 2 + 1]); +*/ + GSVector4i rb, ga; if(abea != abeb) { - r = c[abea*4 + 0]; - g = c[abea*4 + 1]; - b = c[abea*4 + 2]; + rb = c[abea * 2 + 0]; + ga = c[abea * 2 + 1]; if(abeb != 2) { - r -= c[abeb*4 + 0]; - g -= c[abeb*4 + 1]; - b -= c[abeb*4 + 2]; + rb = rb.sub16(c[abeb * 2 + 0]); + ga = ga.sub16(c[abeb * 2 + 1]); } if(!(fpsm == 1 && abec == 1)) { + rb = rb.sll16(4).mul16hs(a); + ga = ga.sll16(4).mul16hs(a); + + /* TODO + if(abec == 2) { r *= m_slenv.afix2; @@ -2564,45 +2747,38 @@ void GSDrawScanline::DrawScanlineExT(int top, int left, int right, const Vertex& g = g.mod2x(c[abec*4 + 3]); b = b.mod2x(c[abec*4 + 3]); } + + */ } if(abed < 2) { - r += c[abed*4 + 0]; - g += c[abed*4 + 1]; - b += c[abed*4 + 2]; + rb = rb.add16(c[abed * 2 + 0]); + ga = ga.add16(c[abed * 2 + 1]); } } else { - r = c[abed*4 + 0]; - g = c[abed*4 + 1]; - b = c[abed*4 + 2]; + rb = c[abed * 2 + 0]; + ga = c[abed * 2 + 1]; } if(pabe) { - GSVector4 mask = c[3] >= GSVector4(128.0f); + GSVector4i mask = (c[1] << 8).sra32(31); - c[0] = c[0].blend8(r, mask); - c[1] = c[1].blend8(g, mask); - c[2] = c[2].blend8(b, mask); - } - else - { - c[0] = r; - c[1] = g; - c[2] = b; + rb = c[0].blend8(rb, mask); + ga = c[1].blend8(ga, mask); } + + c[0] = rb; + c[1] = c[1].mix16(ga); } - GSVector4i rb = GSVector4i(c[0]).ps32(GSVector4i(c[2])); - GSVector4i ga = GSVector4i(c[1]).ps32(GSVector4i(c[3])); - - GSVector4i rg = rb.upl16(ga) & m_slenv.colclamp; - GSVector4i ba = rb.uph16(ga) & m_slenv.colclamp; - - GSVector4i s = rg.upl32(ba).pu16(rg.uph32(ba)); + c[0] &= m_slenv.colclamp; + c[1] &= m_slenv.colclamp; + + GSVector4i s = c[0].upl16(c[1]).pu16(c[0].uph16(c[1])); if(fpsm != 1) { @@ -2614,7 +2790,7 @@ void GSDrawScanline::DrawScanlineExT(int top, int left, int right, const Vertex& s = s.blend(d, fm); } - WriteFrameAndZBufX(fpsm == 1 && rfb ? 0 : fpsm, fa, fm, s, wzb ? zpsm : 3, za, zm, zs, pixels); + WriteFrameAndZBufX(fpsm, fa, fm, s, ztst > 0 ? zpsm : 3, za, zm, zs, pixels); } while(0); @@ -2625,22 +2801,28 @@ void GSDrawScanline::DrawScanlineExT(int top, int left, int right, const Vertex& fa_offset++; za_offset++; - GSVector4 dp4 = m_slenv.dp4; + z += m_slenv.dz4; - z += dp4.zzzz(); - f += dp4.wwww(); + if(fge) + { + f = f.add16(m_slenv.df4); + } - GSVector4 dt4 = m_slenv.dt4; + if(tfx != TFX_NONE) + { + GSVector4 dt4 = m_slenv.dt4; - s += dt4.xxxx(); - t += dt4.yyyy(); - q += dt4.zzzz(); + s += dt4.xxxx(); + t += dt4.yyyy(); + q += dt4.zzzz(); + } - GSVector4 dc4 = m_slenv.dc4; + if(iip) + { + GSVector4i dc4 = m_slenv.dc4; - if(iip) r += dc4.xxxx(); - if(iip) g += dc4.yyyy(); - if(iip) b += dc4.zzzz(); - if(iip) a += dc4.wwww(); + rb = rb.add16(dc4.xxxx()); + ga = ga.add16(dc4.yyyy()); + } } } diff --git a/gsdx/GSDrawScanline.h b/gsdx/GSDrawScanline.h index 306f426..0a1dd34 100644 --- a/gsdx/GSDrawScanline.h +++ b/gsdx/GSDrawScanline.h @@ -84,13 +84,13 @@ class GSDrawScanline : public GSAlignedClass<16>, public IDrawScanline GSVector4i colclamp; GSVector4i fba; GSVector4i aref; - GSVector4 afix; - GSVector4 afix2; - GSVector4 fc; + GSVector4i afix, afix2; + GSVector4i frb, fga; - GSVector4 dp, dp4; + GSVector4 dz, dz4; + GSVector4i df, df4; GSVector4 dt, dt4; - GSVector4 dc, dc4; + GSVector4i drb, dga, dc4; }; struct Offset @@ -123,12 +123,12 @@ class GSDrawScanline : public GSAlignedClass<16>, public IDrawScanline __forceinline GSVector4i Wrap(const GSVector4i& t); - __forceinline void SampleTexture(int pixels, DWORD ztst, DWORD ltf, DWORD pal, const GSVector4i& test, const GSVector4& u, const GSVector4& v, GSVector4* c); - __forceinline void ColorTFX(DWORD tfx, const GSVector4& rf, const GSVector4& gf, const GSVector4& bf, const GSVector4& af, GSVector4& rt, GSVector4& gt, GSVector4& bt); - __forceinline void AlphaTFX(DWORD tfx, DWORD tcc, const GSVector4& af, GSVector4& at); - __forceinline void Fog(const GSVector4& f, GSVector4& r, GSVector4& g, GSVector4& b); + __forceinline void SampleTexture(int pixels, DWORD ztst, DWORD ltf, DWORD pal, const GSVector4i& test, const GSVector4& u, const GSVector4& v, GSVector4i* c); + __forceinline void ColorTFX(DWORD tfx, const GSVector4i& rbf, const GSVector4i& gaf, GSVector4i& rbt, GSVector4i& gat); + __forceinline void AlphaTFX(DWORD tfx, DWORD tcc, const GSVector4i& gaf, GSVector4i& gat); + __forceinline void Fog(const GSVector4i& f, GSVector4i& rb, GSVector4i& ga); __forceinline bool TestZ(DWORD zpsm, DWORD ztst, const GSVector4i& zs, const GSVector4i& za, GSVector4i& test); - __forceinline bool TestAlpha(DWORD atst, DWORD afail, const GSVector4& a, GSVector4i& fm, GSVector4i& zm, GSVector4i& test); + __forceinline bool TestAlpha(DWORD atst, DWORD afail, const GSVector4i& ga, GSVector4i& fm, GSVector4i& zm, GSVector4i& test); __forceinline static DWORD ReadPixel32(DWORD* RESTRICT vm, DWORD addr); __forceinline static DWORD ReadPixel24(DWORD* RESTRICT vm, DWORD addr); diff --git a/gsdx/GSRendererSW.cpp b/gsdx/GSRendererSW.cpp index f49c4e3..93eaa23 100644 --- a/gsdx/GSRendererSW.cpp +++ b/gsdx/GSRendererSW.cpp @@ -22,4 +22,4 @@ #include "StdAfx.h" #include "GSRendererSW.h" -const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 1.0f / 256); +const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 128.0f); diff --git a/gsdx/GSRendererSW.h b/gsdx/GSRendererSW.h index 4ef735a..b576638 100644 --- a/gsdx/GSRendererSW.h +++ b/gsdx/GSRendererSW.h @@ -121,12 +121,11 @@ protected: int x = (int)m_v.XYZ.X - (int)m_context->XYOFFSET.OFX; int y = (int)m_v.XYZ.Y - (int)m_context->XYOFFSET.OFY; - GSVector4i p(x, y, 0, (int)m_v.FOG.F); - - v.p = GSVector4(p) * g_pos_scale; + v.p = GSVector4(x, y, 0, (int)m_v.FOG.F) * g_pos_scale; v.p.z = (float)min(m_v.XYZ.Z, 0xffffff00); // max value which can survive the DWORD=>float=>DWORD conversion v.c = (DWORD)m_v.RGBAQ.ai32[0]; + v.c *= 128.0f; if(PRIM->TME) { @@ -134,14 +133,14 @@ protected: { v.t.x = (float)(int)m_v.UV.U; v.t.y = (float)(int)m_v.UV.V; - v.t *= 1.0f / 16; + v.t *= 4096.0f / 16; v.t.z = 1.0f; } else { v.t.x = m_v.ST.S; v.t.y = m_v.ST.T; - v.t *= GSVector4((float)(1 << m_context->TEX0.TW), (float)(1 << m_context->TEX0.TH)); + v.t *= GSVector4((float)(4096 << m_context->TEX0.TW), (float)(4096 << m_context->TEX0.TH)); v.t.z = m_v.RGBAQ.Q; } } diff --git a/gsdx/GSVector.h b/gsdx/GSVector.h index 9e7736b..035db33 100644 --- a/gsdx/GSVector.h +++ b/gsdx/GSVector.h @@ -329,6 +329,11 @@ public: return store(load(a).min_i16(load(b))); } + GSVector4i clamp8() const + { + return pu16().upl8(); + } + GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const { return GSVector4i(_mm_blendv_epi8(m, a, mask)); @@ -348,6 +353,19 @@ public: return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, a))); } + GSVector4i mix16(const GSVector4i& a) const + { + #if _M_SSE >= 0x401 + + return blend16<0x55>(a); + + #else + + return blend8(a, GSVector4i::x0000ffff()); + + #endif + } + #if _M_SSE >= 0x301 GSVector4i shuffle8(const GSVector4i& mask) const @@ -677,6 +695,21 @@ public: return GSVector4i(_mm_cmpeq_epi32(m, v.m)); } + GSVector4i neq8(const GSVector4i& v) const + { + return ~eq8(v); + } + + GSVector4i neq16(const GSVector4i& v) const + { + return ~eq16(v); + } + + GSVector4i neq32(const GSVector4i& v) const + { + return ~eq32(v); + } + GSVector4i gt8(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi8(m, v.m)); @@ -1212,6 +1245,11 @@ public: return invzero().srl16(8); } + static GSVector4i xff00() + { + return invzero().sll16(8); + } + static GSVector4i x000000ff() { return invzero().srl32(24); @@ -1227,6 +1265,16 @@ public: return invzero().sll32(24); } + static GSVector4i x00000fff() + { + return invzero().srl32(20); + } + + static GSVector4i x0000ffff() + { + return invzero().srl32(16); + } + static GSVector4i x00ffffff() { return invzero().srl32(8); @@ -1265,6 +1313,11 @@ public: return invzero(v).srl16(8); } + static GSVector4i xff00(const GSVector4i& v) + { + return invzero(v).sll16(8); + } + static GSVector4i x000000ff(const GSVector4i& v) { return invzero(v).srl32(24); @@ -1280,6 +1333,16 @@ public: return invzero(v).sll32(24); } + static GSVector4i x00000fff(const GSVector4i& v) + { + return invzero(v).srl32(20); + } + + static GSVector4i x0000ffff(const GSVector4i& v) + { + return invzero(v).srl32(16); + } + static GSVector4i x00ffffff(const GSVector4i& v) { return invzero(v).srl32(8);