From 476850c461425131c638c819feaa0d6044d75abe Mon Sep 17 00:00:00 2001 From: gabest Date: Thu, 11 Dec 2008 03:10:31 +0000 Subject: [PATCH] --- gsdx/GPUDrawScanline.cpp | 66 +++++++++++----------- gsdx/GSDrawScanline.cpp | 116 ++++++++++++++++++++------------------- gsdx/GSRendererSW.h | 4 +- gsdx/GSVector.h | 69 +++++++++++++++++++++++ 4 files changed, 164 insertions(+), 91 deletions(-) diff --git a/gsdx/GPUDrawScanline.cpp b/gsdx/GPUDrawScanline.cpp index d2aab01..dc655ef 100644 --- a/gsdx/GPUDrawScanline.cpp +++ b/gsdx/GPUDrawScanline.cpp @@ -166,8 +166,8 @@ void GPUDrawScanline::SampleTexture(int pixels, DWORD ltf, DWORD tlu, DWORD twin GSVector4i u1 = u0.add16(GSVector4i::x0001()); GSVector4i v1 = v0.add16(GSVector4i::x0001()); - GSVector4i uf = u & GSVector4i::x00ff(); - GSVector4i vf = v & GSVector4i::x00ff(); + GSVector4i uf = (u & GSVector4i::x00ff()) << 7; + GSVector4i vf = (v & GSVector4i::x00ff()) << 7; if(twin) { @@ -247,41 +247,41 @@ void GPUDrawScanline::SampleTexture(int pixels, DWORD ltf, DWORD tlu, DWORD twin #endif - GSVector4i r00 = (c00 & 0x001f001f) << 2; - GSVector4i r01 = (c01 & 0x001f001f) << 2; - GSVector4i r10 = (c10 & 0x001f001f) << 2; - GSVector4i r11 = (c11 & 0x001f001f) << 2; + GSVector4i r00 = (c00 & 0x001f001f) << 3; + GSVector4i r01 = (c01 & 0x001f001f) << 3; + GSVector4i r10 = (c10 & 0x001f001f) << 3; + GSVector4i r11 = (c11 & 0x001f001f) << 3; - r00 = r00.add16(r01.sub16(r00).mul16l(uf).sra16(8)); - r10 = r10.add16(r11.sub16(r10).mul16l(uf).sra16(8)); - c[0] = r00.add16(r10.sub16(r00).mul16l(vf).sra16(8)) << 1; + r00 = r00.lerp16<0>(r01, uf); + r10 = r10.lerp16<0>(r11, uf); + c[0] = r00.lerp16<0>(r10, vf); - GSVector4i g00 = (c00 & 0x03e003e0) >> 3; - GSVector4i g01 = (c01 & 0x03e003e0) >> 3; - GSVector4i g10 = (c10 & 0x03e003e0) >> 3; - GSVector4i g11 = (c11 & 0x03e003e0) >> 3; + GSVector4i g00 = (c00 & 0x03e003e0) >> 2; + GSVector4i g01 = (c01 & 0x03e003e0) >> 2; + GSVector4i g10 = (c10 & 0x03e003e0) >> 2; + GSVector4i g11 = (c11 & 0x03e003e0) >> 2; - g00 = g00.add16(g01.sub16(g00).mul16l(uf).sra16(8)); - g10 = g10.add16(g11.sub16(g10).mul16l(uf).sra16(8)); - c[1] = g00.add16(g10.sub16(g00).mul16l(vf).sra16(8)) << 1; + g00 = g00.lerp16<0>(g01, uf); + g10 = g10.lerp16<0>(g11, uf); + c[1] = g00.lerp16<0>(g10, vf); - GSVector4i b00 = (c00 & 0x7c007c00) >> 8; - GSVector4i b01 = (c01 & 0x7c007c00) >> 8; - GSVector4i b10 = (c10 & 0x7c007c00) >> 8; - GSVector4i b11 = (c11 & 0x7c007c00) >> 8; + GSVector4i b00 = (c00 & 0x7c007c00) >> 7; + GSVector4i b01 = (c01 & 0x7c007c00) >> 7; + GSVector4i b10 = (c10 & 0x7c007c00) >> 7; + GSVector4i b11 = (c11 & 0x7c007c00) >> 7; - b00 = b00.add16(b01.sub16(b00).mul16l(uf).sra16(8)); - b10 = b10.add16(b11.sub16(b10).mul16l(uf).sra16(8)); - c[2] = b00.add16(b10.sub16(b00).mul16l(vf).sra16(8)) << 1; + b00 = b00.lerp16<0>(b01, uf); + b10 = b10.lerp16<0>(b11, uf); + c[2] = b00.lerp16<0>(b10, vf); - GSVector4i a00 = (c00 & 0x80008000) >> 9; - GSVector4i a01 = (c01 & 0x80008000) >> 9; - GSVector4i a10 = (c10 & 0x80008000) >> 9; - GSVector4i a11 = (c11 & 0x80008000) >> 9; + GSVector4i a00 = (c00 & 0x80008000) >> 8; + GSVector4i a01 = (c01 & 0x80008000) >> 8; + GSVector4i a10 = (c10 & 0x80008000) >> 8; + GSVector4i a11 = (c11 & 0x80008000) >> 8; - a00 = a00.add16(a01.sub16(a00).mul16l(uf).sra16(8)); - a10 = a10.add16(a11.sub16(a10).mul16l(uf).sra16(8)); - c[3] = a00.add16(a10.sub16(a00).mul16l(vf).sra16(8)).gt16(GSVector4i::zero()); + a00 = a00.lerp16<0>(a01, uf); + a10 = a10.lerp16<0>(a11, uf); + c[3] = a00.lerp16<0>(a10, vf).gt16(GSVector4i::zero()); // mask out blank pixels (not perfect) @@ -375,9 +375,9 @@ void GPUDrawScanline::ColorTFX(DWORD tfx, const GSVector4i& r, const GSVector4i& c[2] = b.srl16(7); break; case 2: // modulate (tfx = tme | tge) - c[0] = c[0].sll16(2).mul16hu(r).clamp8(); - c[1] = c[1].sll16(2).mul16hu(g).clamp8(); - c[2] = c[2].sll16(2).mul16hu(b).clamp8(); + c[0] = c[0].modulate16<1>(r).clamp8(); + c[1] = c[1].modulate16<1>(g).clamp8(); + c[2] = c[2].modulate16<1>(b).clamp8(); break; case 3: // decal (tfx = tme) break; diff --git a/gsdx/GSDrawScanline.cpp b/gsdx/GSDrawScanline.cpp index 26c73fd..8edc9e1 100644 --- a/gsdx/GSDrawScanline.cpp +++ b/gsdx/GSDrawScanline.cpp @@ -409,7 +409,7 @@ void GSDrawScanline::SampleTexture(int pixels, DWORD ztst, DWORD ltf, DWORD tlu, GSVector4i ui = GSVector4i(u); GSVector4i vi = GSVector4i(v); - GSVector4i uv = (ui.sra32(12)).ps32(vi.sra32(12)); + GSVector4i uv = ui.sra32(15).ps32(vi.sra32(15)); GSVector4i uv0, uv1; GSVector4i addr00, addr01, addr10, addr11; @@ -417,17 +417,24 @@ void GSDrawScanline::SampleTexture(int pixels, DWORD ztst, DWORD ltf, DWORD tlu, if(ltf) { - GSVector4i uvf = (ui & GSVector4i::x00000fff()).ps32(vi & GSVector4i::x00000fff()); + GSVector4i mask = GSVector4i::x00007fff(); + + GSVector4i uvf = (ui & mask).ps32(vi & mask); GSVector4i uf = uvf.upl16(uvf); GSVector4i vf = uvf.uph16(uvf); uv0 = Wrap(uv); uv1 = Wrap(uv.add16(GSVector4i::x0001())); - addr00 = (uv0.uph16() << tw) + uv0.upl16(); - addr01 = (uv0.uph16() << tw) + uv1.upl16(); - addr10 = (uv1.uph16() << tw) + uv0.upl16(); - addr11 = (uv1.uph16() << tw) + uv1.upl16(); + GSVector4i y0 = uv0.uph16() << tw; + GSVector4i y1 = uv1.uph16() << tw; + GSVector4i x0 = uv0.upl16(); + GSVector4i x1 = uv1.upl16(); + + addr00 = y0 + x0; + addr01 = y0 + x1; + addr10 = y1 + x0; + addr11 = y1 + x1; #if _M_SSE >= 0x401 @@ -485,23 +492,25 @@ void GSDrawScanline::SampleTexture(int pixels, DWORD ztst, DWORD ltf, DWORD tlu, #endif - GSVector4i rb00 = c00 & GSVector4i::x00ff(); - GSVector4i rb01 = c01 & GSVector4i::x00ff(); - GSVector4i rb10 = c10 & GSVector4i::x00ff(); - GSVector4i rb11 = c11 & GSVector4i::x00ff(); + mask = GSVector4i::x00ff(); - GSVector4i ga00 = (c00 >> 8) & GSVector4i::x00ff(); - GSVector4i ga01 = (c01 >> 8) & GSVector4i::x00ff(); - GSVector4i ga10 = (c10 >> 8) & GSVector4i::x00ff(); - GSVector4i ga11 = (c11 >> 8) & GSVector4i::x00ff(); + GSVector4i rb00 = c00 & mask; + GSVector4i rb01 = c01 & mask; + GSVector4i rb10 = c10 & mask; + GSVector4i rb11 = c11 & mask; - rb00 = rb00.add16(rb01.sub16(rb00).sll16(4).mul16hs(uf)); - rb10 = rb10.add16(rb11.sub16(rb10).sll16(4).mul16hs(uf)); - rb00 = rb00.add16(rb10.sub16(rb00).sll16(4).mul16hs(vf)); + GSVector4i ga00 = (c00 >> 8) & mask; + GSVector4i ga01 = (c01 >> 8) & mask; + GSVector4i ga10 = (c10 >> 8) & mask; + GSVector4i ga11 = (c11 >> 8) & mask; - ga00 = ga00.add16(ga01.sub16(ga00).sll16(4).mul16hs(uf)); - ga10 = ga10.add16(ga11.sub16(ga10).sll16(4).mul16hs(uf)); - ga00 = ga00.add16(ga10.sub16(ga00).sll16(4).mul16hs(vf)); + rb00 = rb00.lerp16<0>(rb01, uf); + rb10 = rb10.lerp16<0>(rb11, uf); + rb00 = rb00.lerp16<0>(rb10, vf); + + ga00 = ga00.lerp16<0>(ga01, uf); + ga10 = ga10.lerp16<0>(ga11, uf); + ga00 = ga00.lerp16<0>(ga10, vf); c[0] = rb00; c[1] = ga00; @@ -556,8 +565,10 @@ void GSDrawScanline::SampleTexture(int pixels, DWORD ztst, DWORD ltf, DWORD tlu, #endif - c[0] = c00 & GSVector4i::x00ff(); - c[1] = (c00 >> 8) & GSVector4i::x00ff(); + GSVector4i mask = GSVector4i::x00ff(); + + c[0] = c00 & mask; + c[1] = (c00 >> 8) & mask; } } @@ -568,15 +579,15 @@ void GSDrawScanline::ColorTFX(DWORD tfx, const GSVector4i& rbf, const GSVector4i switch(tfx) { case TFX_MODULATE: - rbt = rbt.sll16(2).mul16hu(rbf).clamp8(); + rbt = rbt.modulate16<1>(rbf).clamp8(); break; case TFX_DECAL: break; case TFX_HIGHLIGHT: case TFX_HIGHLIGHT2: - af = gaf.srl16(7).yywwl().yywwh(); - rbt = rbt.sll16(2).mul16hu(rbf).add16(af).clamp8(); - gat = gat.sll16(2).mul16hu(gaf).add16(af).clamp8().mix16(gat); + af = gaf.yywwl().yywwh().srl16(7); + rbt = rbt.modulate16<1>(rbf).add16(af).clamp8(); + gat = gat.modulate16<1>(rbf).add16(af).clamp8().mix16(gat); break; case TFX_NONE: rbt = rbf.srl16(7); @@ -591,7 +602,7 @@ void GSDrawScanline::AlphaTFX(DWORD tfx, DWORD tcc, const GSVector4i& gaf, GSVec switch(tfx) { case TFX_MODULATE: - gat = gat.sll16(2).mul16hu(gaf).clamp8(); + gat = gat.modulate16<1>(gaf).clamp8(); if(!tcc) gat = gat.mix16(gaf.srl16(7)); break; case TFX_DECAL: @@ -613,13 +624,8 @@ void GSDrawScanline::AlphaTFX(DWORD tfx, DWORD tcc, const GSVector4i& gaf, GSVec void GSDrawScanline::Fog(const GSVector4i& f, GSVector4i& rb, GSVector4i& ga) { - GSVector4i frb = m_slenv.frb; - GSVector4i fga = m_slenv.fga; - - GSVector4i fog = f.srl16(3); - - rb = frb.add16(rb.sub16(frb).sll16(4).mul16hs(fog)); - ga = fga.add16(ga.sub16(fga).sll16(4).mul16hs(fog)).mix16(ga); + rb = m_slenv.frb.lerp16<0>(rb, f); + ga = m_slenv.fga.lerp16<0>(ga, f).mix16(ga); } bool GSDrawScanline::TestZ(DWORD zpsm, DWORD ztst, const GSVector4i& zs, const GSVector4i& za, GSVector4i& test) @@ -2339,8 +2345,8 @@ void GSDrawScanline::DrawScanlineT(int top, int left, int right, const Vertex& v if(m_sel.ltf) { - u -= 2048.0f; - v -= 2048.0f; + u -= 0x4000; + v -= 0x4000; } } @@ -2388,8 +2394,10 @@ void GSDrawScanline::DrawScanlineT(int top, int left, int right, const Vertex& v if(m_sel.abe != 255) { - c[2] = d & GSVector4i::x00ff(); - c[3] = (d >> 8) & GSVector4i::x00ff(); + GSVector4i mask = GSVector4i::x00ff(); + + c[2] = d & mask; + c[3] = (d >> 8) & mask; if(fpsm == 1) { @@ -2404,13 +2412,10 @@ void GSDrawScanline::DrawScanlineT(int top, int left, int right, const Vertex& v DWORD abec = m_sel.abec; DWORD abed = m_sel.abed; - GSVector4i a = c[abec * 2 + 1].yywwl().yywwh().sll16(5); + GSVector4i a = c[abec * 2 + 1].yywwl().yywwh().sll16(7); - GSVector4i drb = c[abea * 2 + 0].sub16(c[abeb * 2 + 0]); - GSVector4i dga = c[abea * 2 + 1].sub16(c[abeb * 2 + 1]); - - GSVector4i rb = drb.sll16(4).mul16hs(a).add16(c[abed * 2 + 0]); - GSVector4i ga = dga.sll16(4).mul16hs(a).add16(c[abed * 2 + 1]); + GSVector4i rb = GSVector4i::lerp16<1>(c[abea * 2 + 0], c[abeb * 2 + 0], a, c[abed * 2 + 0]); + GSVector4i ga = GSVector4i::lerp16<1>(c[abea * 2 + 1], c[abeb * 2 + 1], a, c[abed * 2 + 1]); if(m_sel.pabe) { @@ -2590,8 +2595,8 @@ void GSDrawScanline::DrawScanlineExT(int top, int left, int right, const Vertex& if(ltf) { - u -= 2048.0f; - v -= 2048.0f; + u -= 0x4000; + v -= 0x4000; } } @@ -2639,8 +2644,10 @@ void GSDrawScanline::DrawScanlineExT(int top, int left, int right, const Vertex& if(abe != 255) { - c[2] = d & GSVector4i::x00ff(); - c[3] = (d >> 8) & GSVector4i::x00ff(); + GSVector4i mask = GSVector4i::x00ff(); + + c[2] = d & mask; + c[3] = (d >> 8) & mask; if(fpsm == 1) { @@ -2650,13 +2657,10 @@ void GSDrawScanline::DrawScanlineExT(int top, int left, int right, const Vertex& c[4] = GSVector4::zero(); c[5] = m_slenv.afix; - GSVector4i a = c[abec * 2 + 1].yywwl().yywwh().sll16(5); + GSVector4i a = c[abec * 2 + 1].yywwl().yywwh().sll16(7); /* - GSVector4i drb = c[abea * 2 + 0].sub16(c[abeb * 2 + 0]).sll16(4); - GSVector4i dga = c[abea * 2 + 1].sub16(c[abeb * 2 + 1]).sll16(4); - - GSVector4i rb = drb.mul16hs(a).add16(c[abed * 2 + 0]); - GSVector4i ga = dga.mul16hs(a).add16(c[abed * 2 + 1]); + GSVector4i rb = GSVector4i::lerp16<1>(c[abea * 2 + 0], c[abeb * 2 + 0], a, c[abed * 2 + 0]); + GSVector4i ga = GSVector4i::lerp16<1>(c[abea * 2 + 1], c[abeb * 2 + 1], a, c[abed * 2 + 1]); */ GSVector4i rb, ga; @@ -2673,8 +2677,8 @@ void GSDrawScanline::DrawScanlineExT(int top, int left, int right, const Vertex& if(!(fpsm == 1 && abec == 1)) { - rb = rb.sll16(4).mul16hs(a); - ga = ga.sll16(4).mul16hs(a); + rb = rb.sll16(2).mul16hs(a); + ga = ga.sll16(2).mul16hs(a); /* TODO diff --git a/gsdx/GSRendererSW.h b/gsdx/GSRendererSW.h index b576638..6d76a58 100644 --- a/gsdx/GSRendererSW.h +++ b/gsdx/GSRendererSW.h @@ -133,14 +133,14 @@ protected: { v.t.x = (float)(int)m_v.UV.U; v.t.y = (float)(int)m_v.UV.V; - v.t *= 4096.0f / 16; + v.t *= 0x8000 >> 4; v.t.z = 1.0f; } else { v.t.x = m_v.ST.S; v.t.y = m_v.ST.T; - v.t *= GSVector4((float)(4096 << m_context->TEX0.TW), (float)(4096 << m_context->TEX0.TH)); + v.t *= GSVector4((float)(0x8000 << m_context->TEX0.TW), (float)(0x8000 << m_context->TEX0.TH)); v.t.z = m_v.RGBAQ.Q; } } diff --git a/gsdx/GSVector.h b/gsdx/GSVector.h index 267e44c..35aa66b 100644 --- a/gsdx/GSVector.h +++ b/gsdx/GSVector.h @@ -680,6 +680,75 @@ public: #endif + template GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const + { + // (a - this) * f << shift + this + + GSVector4i v = a.sub16(*this); + + #if _M_SSE >= 0x301 + + if(shift > 0) v = v.sll16(shift); + + v = v.mul16hrs(f); + + #else + + v = v.sll16(shift + 1); + + v = v.mul16hs(f); + + #endif + + return add16(v); + } + + template static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, const GSVector4i& d) + { + // (a - b) * c << shift + d + + GSVector4i v = a.sub16(b); + + #if _M_SSE >= 0x301 + + if(shift > 0) v = v.sll16(shift); + + v = v.mul16hrs(c); + + #else + + v = v.sll16(shift + 1); + + v = v.mul16hs(c); + + #endif + + return d.add16(v); + } + + template GSVector4i modulate16(const GSVector4i& f) const + { + // a * f << shift + + GSVector4i v = *this; + + #if _M_SSE >= 0x301 + + if(shift > 0) v = v.sll16(shift); + + v = v.mul16hrs(f); + + #else + + v = v.sll16(shift + 1); + + v = v.mul16hs(f); + + #endif + + return v; + } + GSVector4i eq8(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi8(m, v.m));