mirror of
https://github.com/libretro/pcsx2.git
synced 2025-01-17 15:02:43 +00:00
33adabb035
In the future these will be _M_X86_64, but for now this won't be the case.
1032 lines
19 KiB
C++
1032 lines
19 KiB
C++
/*
|
|
* Copyright (C) 2007-2009 Gabest
|
|
* http://www.gabest.org
|
|
*
|
|
* This Program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2, or (at your option)
|
|
* any later version.
|
|
*
|
|
* This Program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with GNU Make; see the file COPYING. If not, write to
|
|
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA.
|
|
* http://www.gnu.org/copyleft/gpl.html
|
|
*
|
|
*/
|
|
|
|
// TODO: x64
|
|
|
|
#include "stdafx.h"
|
|
#include "GPUDrawScanlineCodeGenerator.h"
|
|
#include "GSVertexSW.h"
|
|
|
|
// Byte offsets, relative to esp, used by the generated code after Generate()
// has pushed esi and edi (2 x 4 bytes).
//   _args : space taken by the two saved registers
//   _top  : first stack argument, read by Init() as "top"
//           (the extra 4 bytes are presumably the return address - confirm)
//   _v    : second stack argument, read by Init() as the vertex pointer
static const int _args = 8;
static const int _top = _args + 4;
static const int _v = _args + 8;
|
|
|
|
// Builds the scanline routine for one selector key.
//   param   : points at the GPUScanlineLocalData the generated code reads and writes
//   key     : selector bits, stored in m_sel and consulted while emitting
//   code    : buffer handed to GSCodeGenerator
//   maxsize : size passed along with that buffer
GPUDrawScanlineCodeGenerator::GPUDrawScanlineCodeGenerator(void* param, uint32 key, void* code, size_t maxsize)
	: GSCodeGenerator(code, maxsize)
	, m_local(*static_cast<GPUScanlineLocalData*>(param))
{
	m_sel.key = key;

	// all emission happens right here, at construction time

	Generate();
}
|
|
|
|
void GPUDrawScanlineCodeGenerator::Generate()
|
|
{
|
|
push(esi);
|
|
push(edi);
|
|
|
|
Init();
|
|
|
|
align(16);
|
|
|
|
L("loop");
|
|
|
|
// GSVector4i test = m_test[7 + (steps & (steps >> 31))];
|
|
|
|
mov(edx, ecx);
|
|
sar(edx, 31);
|
|
and(edx, ecx);
|
|
shl(edx, 4);
|
|
|
|
movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
|
|
|
|
// movdqu(xmm1, ptr[edi]);
|
|
|
|
movq(xmm1, qword[edi]);
|
|
movhps(xmm1, qword[edi + 8]);
|
|
|
|
// ecx = steps
|
|
// esi = tex (tme)
|
|
// edi = fb
|
|
// xmm1 = fd
|
|
// xmm2 = s
|
|
// xmm3 = t
|
|
// xmm4 = r
|
|
// xmm5 = g
|
|
// xmm6 = b
|
|
// xmm7 = test
|
|
|
|
TestMask();
|
|
|
|
SampleTexture();
|
|
|
|
// xmm1 = fd
|
|
// xmm3 = a
|
|
// xmm4 = r
|
|
// xmm5 = g
|
|
// xmm6 = b
|
|
// xmm7 = test
|
|
// xmm0, xmm2 = free
|
|
|
|
ColorTFX();
|
|
|
|
AlphaBlend();
|
|
|
|
Dither();
|
|
|
|
WriteFrame();
|
|
|
|
L("step");
|
|
|
|
// if(steps <= 0) break;
|
|
|
|
test(ecx, ecx);
|
|
jle("exit", T_NEAR);
|
|
|
|
Step();
|
|
|
|
jmp("loop", T_NEAR);
|
|
|
|
L("exit");
|
|
|
|
pop(edi);
|
|
pop(esi);
|
|
|
|
ret(8);
|
|
}
|
|
|
|
void GPUDrawScanlineCodeGenerator::Init()
|
|
{
|
|
mov(eax, dword[esp + _top]);
|
|
|
|
// uint16* fb = (uint16*)m_global.vm + (top << (10 + sel.scalex)) + left;
|
|
|
|
mov(edi, eax);
|
|
shl(edi, 10 + m_sel.scalex);
|
|
add(edi, edx);
|
|
lea(edi, ptr[edi * 2 + (size_t)m_local.gd->vm]);
|
|
|
|
// int steps = pixels - 8;
|
|
|
|
sub(ecx, 8);
|
|
|
|
if(m_sel.dtd)
|
|
{
|
|
// dither = GSVector4i::load<false>(&m_dither[top & 3][left & 3]);
|
|
|
|
and(eax, 3);
|
|
shl(eax, 5);
|
|
and(edx, 3);
|
|
shl(edx, 1);
|
|
movdqu(xmm0, ptr[eax + edx + (size_t)m_dither]);
|
|
movdqa(ptr[&m_local.temp.dither], xmm0);
|
|
}
|
|
|
|
mov(edx, dword[esp + _v]);
|
|
|
|
if(m_sel.tme)
|
|
{
|
|
mov(esi, dword[&m_local.gd->tex]);
|
|
|
|
// GSVector4i vt = GSVector4i(v.t).xxzzl();
|
|
|
|
cvttps2dq(xmm4, ptr[edx + offsetof(GSVertexSW, t)]);
|
|
pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
|
|
|
|
// s = vt.xxxx().add16(m_local.d.s);
|
|
// t = vt.yyyy().add16(m_local.d.t);
|
|
|
|
pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
|
pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
|
|
|
|
paddw(xmm2, ptr[&m_local.d.s]);
|
|
|
|
if(!m_sel.sprite)
|
|
{
|
|
paddw(xmm3, ptr[&m_local.d.t]);
|
|
}
|
|
else
|
|
{
|
|
if(m_sel.ltf)
|
|
{
|
|
movdqa(xmm0, xmm3);
|
|
psllw(xmm0, 8);
|
|
psrlw(xmm0, 1);
|
|
movdqa(ptr[&m_local.temp.vf], xmm0);
|
|
}
|
|
}
|
|
|
|
movdqa(ptr[&m_local.temp.s], xmm2);
|
|
movdqa(ptr[&m_local.temp.t], xmm3);
|
|
}
|
|
|
|
if(m_sel.tfx != 3) // != decal
|
|
{
|
|
// GSVector4i vc = GSVector4i(v.c).xxzzlh();
|
|
|
|
cvttps2dq(xmm6, ptr[edx + offsetof(GSVertexSW, c)]);
|
|
pshuflw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
|
|
pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
|
|
|
|
// r = vc.xxxx();
|
|
// g = vc.yyyy();
|
|
// b = vc.zzzz();
|
|
|
|
pshufd(xmm4, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
|
|
pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 1, 1, 1));
|
|
pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));
|
|
|
|
if(m_sel.iip)
|
|
{
|
|
// r = r.add16(m_local.d.r);
|
|
// g = g.add16(m_local.d.g);
|
|
// b = b.add16(m_local.d.b);
|
|
|
|
paddw(xmm4, ptr[&m_local.d.r]);
|
|
paddw(xmm5, ptr[&m_local.d.g]);
|
|
paddw(xmm6, ptr[&m_local.d.b]);
|
|
}
|
|
|
|
movdqa(ptr[&m_local.temp.r], xmm4);
|
|
movdqa(ptr[&m_local.temp.g], xmm5);
|
|
movdqa(ptr[&m_local.temp.b], xmm6);
|
|
}
|
|
}
|
|
|
|
void GPUDrawScanlineCodeGenerator::Step()
|
|
{
|
|
// steps -= 8;
|
|
|
|
sub(ecx, 8);
|
|
|
|
// fb += 8;
|
|
|
|
add(edi, 8 * sizeof(uint16));
|
|
|
|
if(m_sel.tme)
|
|
{
|
|
// GSVector4i st = m_local.d8.st;
|
|
|
|
movdqa(xmm4, ptr[&m_local.d8.st]);
|
|
|
|
// s = s.add16(st.xxxx());
|
|
// t = t.add16(st.yyyy());
|
|
|
|
pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
|
|
paddw(xmm2, ptr[&m_local.temp.s]);
|
|
movdqa(ptr[&m_local.temp.s], xmm2);
|
|
|
|
// TODO: if(!sprite) ... else reload t
|
|
|
|
pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));
|
|
paddw(xmm3, ptr[&m_local.temp.t]);
|
|
movdqa(ptr[&m_local.temp.t], xmm3);
|
|
}
|
|
|
|
if(m_sel.tfx != 3) // != decal
|
|
{
|
|
if(m_sel.iip)
|
|
{
|
|
// GSVector4i c = m_local.d8.c;
|
|
|
|
// r = r.add16(c.xxxx());
|
|
// g = g.add16(c.yyyy());
|
|
// b = b.add16(c.zzzz());
|
|
|
|
movdqa(xmm6, ptr[&m_local.d8.c]);
|
|
|
|
pshufd(xmm4, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
|
|
pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 1, 1, 1));
|
|
pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));
|
|
|
|
paddw(xmm4, ptr[&m_local.temp.r]);
|
|
paddw(xmm5, ptr[&m_local.temp.g]);
|
|
paddw(xmm6, ptr[&m_local.temp.b]);
|
|
|
|
movdqa(ptr[&m_local.temp.r], xmm4);
|
|
movdqa(ptr[&m_local.temp.g], xmm5);
|
|
movdqa(ptr[&m_local.temp.b], xmm6);
|
|
}
|
|
else
|
|
{
|
|
movdqa(xmm4, ptr[&m_local.temp.r]);
|
|
movdqa(xmm5, ptr[&m_local.temp.g]);
|
|
movdqa(xmm6, ptr[&m_local.temp.b]);
|
|
}
|
|
}
|
|
}
|
|
|
|
void GPUDrawScanlineCodeGenerator::TestMask()
|
|
{
|
|
if(!m_sel.me)
|
|
{
|
|
return;
|
|
}
|
|
|
|
// test |= fd.sra16(15);
|
|
|
|
movdqa(xmm0, xmm1);
|
|
psraw(xmm0, 15);
|
|
por(xmm7, xmm0);
|
|
|
|
alltrue();
|
|
}
|
|
|
|
void GPUDrawScanlineCodeGenerator::SampleTexture()
|
|
{
|
|
if(!m_sel.tme)
|
|
{
|
|
return;
|
|
}
|
|
|
|
if(m_sel.tlu)
|
|
{
|
|
mov(edx, ptr[&m_local.gd->clut]);
|
|
}
|
|
|
|
// xmm2 = s
|
|
// xmm3 = t
|
|
// xmm7 = test
|
|
// xmm0, xmm4, xmm5, xmm6 = free
|
|
// xmm1 = used
|
|
|
|
if(m_sel.ltf)
|
|
{
|
|
// GSVector4i u = s.sub16(GSVector4i(0x00200020)); // - 0.125f
|
|
// GSVector4i v = t.sub16(GSVector4i(0x00200020)); // - 0.125f
|
|
|
|
mov(eax, 0x00200020);
|
|
movd(xmm0, eax);
|
|
pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
psubw(xmm2, xmm0);
|
|
psubw(xmm3, xmm0);
|
|
|
|
// GSVector4i uf = (u & GSVector4i::x00ff()) << 7;
|
|
// GSVector4i vf = (v & GSVector4i::x00ff()) << 7;
|
|
|
|
movdqa(xmm0, xmm2);
|
|
psllw(xmm0, 8);
|
|
psrlw(xmm0, 1);
|
|
movdqa(ptr[&m_local.temp.uf], xmm0);
|
|
|
|
if(!m_sel.sprite)
|
|
{
|
|
movdqa(xmm0, xmm3);
|
|
psllw(xmm0, 8);
|
|
psrlw(xmm0, 1);
|
|
movdqa(ptr[&m_local.temp.vf], xmm0);
|
|
}
|
|
}
|
|
|
|
// GSVector4i u0 = s.srl16(8);
|
|
// GSVector4i v0 = t.srl16(8);
|
|
|
|
psrlw(xmm2, 8);
|
|
psrlw(xmm3, 8);
|
|
|
|
// xmm2 = u
|
|
// xmm3 = v
|
|
// xmm7 = test
|
|
// xmm0, xmm4, xmm5, xmm6 = free
|
|
// xmm1 = used
|
|
|
|
if(m_sel.ltf)
|
|
{
|
|
// GSVector4i u1 = u0.add16(GSVector4i::x0001());
|
|
// GSVector4i v1 = v0.add16(GSVector4i::x0001());
|
|
|
|
movdqa(xmm4, xmm2);
|
|
movdqa(xmm5, xmm3);
|
|
|
|
pcmpeqd(xmm0, xmm0);
|
|
psrlw(xmm0, 15);
|
|
paddw(xmm4, xmm0);
|
|
paddw(xmm5, xmm0);
|
|
|
|
if(m_sel.twin)
|
|
{
|
|
// u0 = (u0 & m_local.twin[0].u).add16(m_local.twin[1].u);
|
|
// v0 = (v0 & m_local.twin[0].v).add16(m_local.twin[1].v);
|
|
// u1 = (u1 & m_local.twin[0].u).add16(m_local.twin[1].u);
|
|
// v1 = (v1 & m_local.twin[0].v).add16(m_local.twin[1].v);
|
|
|
|
movdqa(xmm0, ptr[&m_local.twin[0].u]);
|
|
movdqa(xmm6, ptr[&m_local.twin[1].u]);
|
|
|
|
pand(xmm2, xmm0);
|
|
paddw(xmm2, xmm6);
|
|
pand(xmm4, xmm0);
|
|
paddw(xmm4, xmm6);
|
|
|
|
movdqa(xmm0, ptr[&m_local.twin[0].v]);
|
|
movdqa(xmm6, ptr[&m_local.twin[1].v]);
|
|
|
|
pand(xmm3, xmm0);
|
|
paddw(xmm3, xmm6);
|
|
pand(xmm5, xmm0);
|
|
paddw(xmm5, xmm6);
|
|
}
|
|
else
|
|
{
|
|
// u0 = u0.min_i16(m_local.twin[2].u);
|
|
// v0 = v0.min_i16(m_local.twin[2].v);
|
|
// u1 = u1.min_i16(m_local.twin[2].u);
|
|
// v1 = v1.min_i16(m_local.twin[2].v);
|
|
|
|
// TODO: if(!sprite) clamp16 else:
|
|
|
|
movdqa(xmm0, ptr[&m_local.twin[2].u]);
|
|
movdqa(xmm6, ptr[&m_local.twin[2].v]);
|
|
|
|
pminsw(xmm2, xmm0);
|
|
pminsw(xmm3, xmm6);
|
|
pminsw(xmm4, xmm0);
|
|
pminsw(xmm5, xmm6);
|
|
}
|
|
|
|
// xmm2 = u0
|
|
// xmm3 = v0
|
|
// xmm4 = u1
|
|
// xmm5 = v1
|
|
// xmm7 = test
|
|
// xmm0, xmm6 = free
|
|
// xmm1 = used
|
|
|
|
// GSVector4i addr00 = v0.sll16(8) | u0;
|
|
// GSVector4i addr01 = v0.sll16(8) | u1;
|
|
// GSVector4i addr10 = v1.sll16(8) | u0;
|
|
// GSVector4i addr11 = v1.sll16(8) | u1;
|
|
|
|
psllw(xmm3, 8);
|
|
movdqa(xmm0, xmm3);
|
|
por(xmm3, xmm2);
|
|
por(xmm0, xmm4);
|
|
|
|
psllw(xmm5, 8);
|
|
movdqa(xmm6, xmm5);
|
|
por(xmm5, xmm2);
|
|
por(xmm6, xmm4);
|
|
|
|
// xmm3 = addr00
|
|
// xmm0 = addr01
|
|
// xmm5 = addr10
|
|
// xmm6 = addr11
|
|
// xmm7 = test
|
|
// xmm2, xmm4 = free
|
|
// xmm1 = used
|
|
|
|
ReadTexel(xmm2, xmm3);
|
|
ReadTexel(xmm4, xmm0);
|
|
ReadTexel(xmm3, xmm5);
|
|
ReadTexel(xmm5, xmm6);
|
|
|
|
// xmm2 = c00
|
|
// xmm4 = c01
|
|
// xmm3 = c10
|
|
// xmm5 = c11
|
|
// xmm7 = test
|
|
// xmm0, xmm6 = free
|
|
// xmm1 = used
|
|
|
|
// spill (TODO)
|
|
|
|
movdqa(ptr[&m_local.temp.fd], xmm1);
|
|
movdqa(ptr[&m_local.temp.test], xmm7);
|
|
|
|
// xmm2 = c00
|
|
// xmm4 = c01
|
|
// xmm3 = c10
|
|
// xmm5 = c11
|
|
// xmm0, xmm1, xmm6, xmm7 = free
|
|
|
|
movdqa(xmm1, xmm2);
|
|
psllw(xmm1, 11);
|
|
psrlw(xmm1, 8);
|
|
|
|
movdqa(xmm0, xmm4);
|
|
psllw(xmm0, 11);
|
|
psrlw(xmm0, 8);
|
|
|
|
lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.uf]);
|
|
|
|
movdqa(xmm6, xmm2);
|
|
psllw(xmm6, 6);
|
|
psrlw(xmm6, 11);
|
|
psllw(xmm6, 3);
|
|
|
|
movdqa(xmm1, xmm4);
|
|
psllw(xmm1, 6);
|
|
psrlw(xmm1, 11);
|
|
psllw(xmm1, 3);
|
|
|
|
lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.uf]);
|
|
|
|
movdqa(xmm7, xmm2);
|
|
psllw(xmm7, 1);
|
|
psrlw(xmm7, 11);
|
|
psllw(xmm7, 3);
|
|
|
|
movdqa(xmm6, xmm4);
|
|
psllw(xmm6, 1);
|
|
psrlw(xmm6, 11);
|
|
psllw(xmm6, 3);
|
|
|
|
lerp16<0>(xmm6, xmm7, ptr[&m_local.temp.uf]);
|
|
|
|
psraw(xmm2, 15);
|
|
psrlw(xmm2, 8);
|
|
psraw(xmm4, 15);
|
|
psrlw(xmm4, 8);
|
|
|
|
lerp16<0>(xmm4, xmm2, ptr[&m_local.temp.uf]);
|
|
|
|
// xmm0 = r00
|
|
// xmm1 = g00
|
|
// xmm6 = b00
|
|
// xmm4 = a00
|
|
// xmm3 = c10
|
|
// xmm5 = c11
|
|
// xmm2, xmm7 = free
|
|
|
|
movdqa(xmm7, xmm3);
|
|
psllw(xmm7, 11);
|
|
psrlw(xmm7, 8);
|
|
|
|
movdqa(xmm2, xmm5);
|
|
psllw(xmm2, 11);
|
|
psrlw(xmm2, 8);
|
|
|
|
lerp16<0>(xmm2, xmm7, ptr[&m_local.temp.uf]);
|
|
lerp16<0>(xmm2, xmm0, ptr[&m_local.temp.vf]);
|
|
|
|
// xmm2 = r
|
|
// xmm1 = g00
|
|
// xmm6 = b00
|
|
// xmm4 = a00
|
|
// xmm3 = c10
|
|
// xmm5 = c11
|
|
// xmm0, xmm7 = free
|
|
|
|
movdqa(xmm7, xmm3);
|
|
psllw(xmm7, 6);
|
|
psrlw(xmm7, 11);
|
|
psllw(xmm7, 3);
|
|
|
|
movdqa(xmm0, xmm5);
|
|
psllw(xmm0, 6);
|
|
psrlw(xmm0, 11);
|
|
psllw(xmm0, 3);
|
|
|
|
lerp16<0>(xmm0, xmm7, ptr[&m_local.temp.uf]);
|
|
lerp16<0>(xmm0, xmm1, ptr[&m_local.temp.vf]);
|
|
|
|
// xmm2 = r
|
|
// xmm0 = g
|
|
// xmm6 = b00
|
|
// xmm4 = a00
|
|
// xmm3 = c10
|
|
// xmm5 = c11
|
|
// xmm1, xmm7 = free
|
|
|
|
movdqa(xmm7, xmm3);
|
|
psllw(xmm7, 1);
|
|
psrlw(xmm7, 11);
|
|
psllw(xmm7, 3);
|
|
|
|
movdqa(xmm1, xmm5);
|
|
psllw(xmm1, 1);
|
|
psrlw(xmm1, 11);
|
|
psllw(xmm1, 3);
|
|
|
|
lerp16<0>(xmm1, xmm7, ptr[&m_local.temp.uf]);
|
|
lerp16<0>(xmm1, xmm6, ptr[&m_local.temp.vf]);
|
|
|
|
// xmm2 = r
|
|
// xmm0 = g
|
|
// xmm1 = b
|
|
// xmm4 = a00
|
|
// xmm3 = c10
|
|
// xmm5 = c11
|
|
// xmm6, xmm7 = free
|
|
|
|
psraw(xmm3, 15);
|
|
psrlw(xmm3, 8);
|
|
psraw(xmm5, 15);
|
|
psrlw(xmm5, 8);
|
|
|
|
lerp16<0>(xmm5, xmm3, ptr[&m_local.temp.uf]);
|
|
lerp16<0>(xmm5, xmm4, ptr[&m_local.temp.vf]);
|
|
|
|
// xmm2 = r
|
|
// xmm0 = g
|
|
// xmm1 = b
|
|
// xmm5 = a
|
|
// xmm3, xmm4, xmm6, xmm7 = free
|
|
|
|
// TODO
|
|
movdqa(xmm3, xmm5); // a
|
|
movdqa(xmm4, xmm2); // r
|
|
movdqa(xmm6, xmm1); // b
|
|
movdqa(xmm5, xmm0); // g
|
|
|
|
// reload test
|
|
|
|
movdqa(xmm7, ptr[&m_local.temp.test]);
|
|
|
|
// xmm4 = r
|
|
// xmm5 = g
|
|
// xmm6 = b
|
|
// xmm3 = a
|
|
// xmm7 = test
|
|
// xmm0, xmm1, xmm2 = free
|
|
|
|
// test |= (c[0] | c[1] | c[2] | c[3]).eq16(GSVector4i::zero()); // mask out blank pixels (not perfect)
|
|
|
|
movdqa(xmm1, xmm3);
|
|
por(xmm1, xmm4);
|
|
movdqa(xmm2, xmm5);
|
|
por(xmm2, xmm6);
|
|
por(xmm1, xmm2);
|
|
|
|
pxor(xmm0, xmm0);
|
|
pcmpeqw(xmm1, xmm0);
|
|
por(xmm7, xmm1);
|
|
|
|
// a = a.gt16(GSVector4i::zero());
|
|
|
|
pcmpgtw(xmm3, xmm0);
|
|
|
|
// reload fd
|
|
|
|
movdqa(xmm1, ptr[&m_local.temp.fd]);
|
|
}
|
|
else
|
|
{
|
|
if(m_sel.twin)
|
|
{
|
|
// u = (u & m_local.twin[0].u).add16(m_local.twin[1].u);
|
|
// v = (v & m_local.twin[0].v).add16(m_local.twin[1].v);
|
|
|
|
pand(xmm2, ptr[&m_local.twin[0].u]);
|
|
paddw(xmm2, ptr[&m_local.twin[1].u]);
|
|
pand(xmm3, ptr[&m_local.twin[0].v]);
|
|
paddw(xmm3, ptr[&m_local.twin[1].v]);
|
|
}
|
|
else
|
|
{
|
|
// u = u.min_i16(m_local.twin[2].u);
|
|
// v = v.min_i16(m_local.twin[2].v);
|
|
|
|
// TODO: if(!sprite) clamp16 else:
|
|
|
|
pminsw(xmm2, ptr[&m_local.twin[2].u]);
|
|
pminsw(xmm3, ptr[&m_local.twin[2].v]);
|
|
}
|
|
|
|
// xmm2 = u
|
|
// xmm3 = v
|
|
// xmm7 = test
|
|
// xmm0, xmm4, xmm5, xmm6 = free
|
|
// xmm1 = used
|
|
|
|
// GSVector4i addr = v.sll16(8) | u;
|
|
|
|
psllw(xmm3, 8);
|
|
por(xmm3, xmm2);
|
|
|
|
// xmm3 = addr
|
|
// xmm7 = test
|
|
// xmm0, xmm2, xmm4, xmm5, xmm6 = free
|
|
// xmm1 = used
|
|
|
|
ReadTexel(xmm6, xmm3);
|
|
|
|
// xmm3 = c00
|
|
// xmm7 = test
|
|
// xmm0, xmm2, xmm4, xmm5, xmm6 = free
|
|
// xmm1 = used
|
|
|
|
// test |= c00.eq16(GSVector4i::zero()); // mask out blank pixels
|
|
|
|
pxor(xmm0, xmm0);
|
|
pcmpeqw(xmm0, xmm6);
|
|
por(xmm7, xmm0);
|
|
|
|
// c[0] = (c00 << 3) & 0x00f800f8;
|
|
// c[1] = (c00 >> 2) & 0x00f800f8;
|
|
// c[2] = (c00 >> 7) & 0x00f800f8;
|
|
// c[3] = c00.sra16(15);
|
|
|
|
movdqa(xmm3, xmm6);
|
|
psraw(xmm3, 15); // a
|
|
|
|
pcmpeqd(xmm0, xmm0);
|
|
psrlw(xmm0, 11);
|
|
psllw(xmm0, 3); // 0x00f8
|
|
|
|
movdqa(xmm4, xmm6);
|
|
psllw(xmm4, 3);
|
|
pand(xmm4, xmm0); // r
|
|
|
|
movdqa(xmm5, xmm6);
|
|
psrlw(xmm5, 2);
|
|
pand(xmm5, xmm0); // g
|
|
|
|
psrlw(xmm6, 7);
|
|
pand(xmm6, xmm0); // b
|
|
}
|
|
}
|
|
|
|
void GPUDrawScanlineCodeGenerator::ColorTFX()
|
|
{
|
|
switch(m_sel.tfx)
|
|
{
|
|
case 0: // none (tfx = 0)
|
|
case 1: // none (tfx = tge)
|
|
// c[0] = r.srl16(7);
|
|
// c[1] = g.srl16(7);
|
|
// c[2] = b.srl16(7);
|
|
psrlw(xmm4, 7);
|
|
psrlw(xmm5, 7);
|
|
psrlw(xmm6, 7);
|
|
break;
|
|
case 2: // modulate (tfx = tme | tge)
|
|
// c[0] = c[0].modulate16<1>(r).clamp8();
|
|
// c[1] = c[1].modulate16<1>(g).clamp8();
|
|
// c[2] = c[2].modulate16<1>(b).clamp8();
|
|
pcmpeqd(xmm0, xmm0);
|
|
psrlw(xmm0, 8);
|
|
modulate16<1>(xmm4, ptr[&m_local.temp.r]);
|
|
pminsw(xmm4, xmm0);
|
|
modulate16<1>(xmm5, ptr[&m_local.temp.g]);
|
|
pminsw(xmm5, xmm0);
|
|
modulate16<1>(xmm6, ptr[&m_local.temp.b]);
|
|
pminsw(xmm6, xmm0);
|
|
break;
|
|
case 3: // decal (tfx = tme)
|
|
break;
|
|
}
|
|
}
|
|
|
|
void GPUDrawScanlineCodeGenerator::AlphaBlend()
|
|
{
|
|
if(!m_sel.abe)
|
|
{
|
|
return;
|
|
}
|
|
|
|
// xmm1 = fd
|
|
// xmm3 = a
|
|
// xmm4 = r
|
|
// xmm5 = g
|
|
// xmm6 = b
|
|
// xmm7 = test
|
|
// xmm0, xmm2 = free
|
|
|
|
// GSVector4i r = (fd & 0x001f001f) << 3;
|
|
|
|
pcmpeqd(xmm0, xmm0);
|
|
psrlw(xmm0, 11); // 0x001f
|
|
movdqa(xmm2, xmm1);
|
|
pand(xmm2, xmm0);
|
|
psllw(xmm2, 3);
|
|
|
|
switch(m_sel.abr)
|
|
{
|
|
case 0:
|
|
// r = r.avg8(c[0]);
|
|
pavgb(xmm2, xmm4);
|
|
break;
|
|
case 1:
|
|
// r = r.addus8(c[0]);
|
|
paddusb(xmm2, xmm4);
|
|
break;
|
|
case 2:
|
|
// r = r.subus8(c[0]);
|
|
psubusb(xmm2, xmm4);
|
|
break;
|
|
case 3:
|
|
// r = r.addus8(c[0].srl16(2));
|
|
movdqa(xmm0, xmm4);
|
|
psrlw(xmm0, 2);
|
|
paddusb(xmm2, xmm0);
|
|
break;
|
|
}
|
|
|
|
if(m_sel.tme)
|
|
{
|
|
movdqa(xmm0, xmm3);
|
|
blend8(xmm4, xmm2);
|
|
}
|
|
else
|
|
{
|
|
movdqa(xmm4, xmm2);
|
|
}
|
|
|
|
// GSVector4i g = (d & 0x03e003e0) >> 2;
|
|
|
|
pcmpeqd(xmm0, xmm0);
|
|
psrlw(xmm0, 11);
|
|
psllw(xmm0, 5); // 0x03e0
|
|
movdqa(xmm2, xmm1);
|
|
pand(xmm2, xmm0);
|
|
psrlw(xmm2, 2);
|
|
|
|
switch(m_sel.abr)
|
|
{
|
|
case 0:
|
|
// g = g.avg8(c[2]);
|
|
pavgb(xmm2, xmm5);
|
|
break;
|
|
case 1:
|
|
// g = g.addus8(c[2]);
|
|
paddusb(xmm2, xmm5);
|
|
break;
|
|
case 2:
|
|
// g = g.subus8(c[2]);
|
|
psubusb(xmm2, xmm5);
|
|
break;
|
|
case 3:
|
|
// g = g.addus8(c[2].srl16(2));
|
|
movdqa(xmm0, xmm5);
|
|
psrlw(xmm0, 2);
|
|
paddusb(xmm2, xmm0);
|
|
break;
|
|
}
|
|
|
|
if(m_sel.tme)
|
|
{
|
|
movdqa(xmm0, xmm3);
|
|
blend8(xmm5, xmm2);
|
|
}
|
|
else
|
|
{
|
|
movdqa(xmm5, xmm2);
|
|
}
|
|
|
|
// GSVector4i b = (d & 0x7c007c00) >> 7;
|
|
|
|
pcmpeqd(xmm0, xmm0);
|
|
psrlw(xmm0, 11);
|
|
psllw(xmm0, 10); // 0x7c00
|
|
movdqa(xmm2, xmm1);
|
|
pand(xmm2, xmm0);
|
|
psrlw(xmm2, 7);
|
|
|
|
switch(m_sel.abr)
|
|
{
|
|
case 0:
|
|
// b = b.avg8(c[2]);
|
|
pavgb(xmm2, xmm6);
|
|
break;
|
|
case 1:
|
|
// b = b.addus8(c[2]);
|
|
paddusb(xmm2, xmm6);
|
|
break;
|
|
case 2:
|
|
// b = b.subus8(c[2]);
|
|
psubusb(xmm2, xmm6);
|
|
break;
|
|
case 3:
|
|
// b = b.addus8(c[2].srl16(2));
|
|
movdqa(xmm0, xmm6);
|
|
psrlw(xmm0, 2);
|
|
paddusb(xmm2, xmm0);
|
|
break;
|
|
}
|
|
|
|
if(m_sel.tme)
|
|
{
|
|
movdqa(xmm0, xmm3);
|
|
blend8(xmm6, xmm2);
|
|
}
|
|
else
|
|
{
|
|
movdqa(xmm6, xmm2);
|
|
}
|
|
}
|
|
|
|
void GPUDrawScanlineCodeGenerator::Dither()
|
|
{
|
|
if(!m_sel.dtd)
|
|
{
|
|
return;
|
|
}
|
|
|
|
// c[0] = c[0].addus8(dither);
|
|
// c[1] = c[1].addus8(dither);
|
|
// c[2] = c[2].addus8(dither);
|
|
|
|
movdqa(xmm0, ptr[&m_local.temp.dither]);
|
|
|
|
paddusb(xmm4, xmm0);
|
|
paddusb(xmm5, xmm0);
|
|
paddusb(xmm6, xmm0);
|
|
}
|
|
|
|
void GPUDrawScanlineCodeGenerator::WriteFrame()
|
|
{
|
|
// GSVector4i fs = r | g | b | (m_sel.md ? GSVector4i(0x80008000) : m_sel.tme ? a : 0);
|
|
|
|
pcmpeqd(xmm0, xmm0);
|
|
|
|
if(m_sel.md || m_sel.tme)
|
|
{
|
|
movdqa(xmm2, xmm0);
|
|
psllw(xmm2, 15);
|
|
}
|
|
|
|
psrlw(xmm0, 11);
|
|
psllw(xmm0, 3);
|
|
|
|
// xmm0 = 0x00f8
|
|
// xmm2 = 0x8000 (md)
|
|
|
|
// GSVector4i r = (c[0] & 0x00f800f8) >> 3;
|
|
|
|
pand(xmm4, xmm0);
|
|
psrlw(xmm4, 3);
|
|
|
|
// GSVector4i g = (c[1] & 0x00f800f8) << 2;
|
|
|
|
pand(xmm5, xmm0);
|
|
psllw(xmm5, 2);
|
|
por(xmm4, xmm5);
|
|
|
|
// GSVector4i b = (c[2] & 0x00f800f8) << 7;
|
|
|
|
pand(xmm6, xmm0);
|
|
psllw(xmm6, 7);
|
|
por(xmm4, xmm6);
|
|
|
|
if(m_sel.md)
|
|
{
|
|
// GSVector4i a = GSVector4i(0x80008000);
|
|
|
|
por(xmm4, xmm2);
|
|
}
|
|
else if(m_sel.tme)
|
|
{
|
|
// GSVector4i a = (c[3] << 8) & 0x80008000;
|
|
|
|
psllw(xmm3, 8);
|
|
pand(xmm3, xmm2);
|
|
por(xmm4, xmm3);
|
|
}
|
|
|
|
// fs = fs.blend8(fd, test);
|
|
|
|
movdqa(xmm0, xmm7);
|
|
blend8(xmm4, xmm1);
|
|
|
|
// GSVector4i::store<false>(fb, fs);
|
|
|
|
// movdqu(ptr[edi], xmm4);
|
|
|
|
movq(qword[edi], xmm4);
|
|
movhps(qword[edi + 8], xmm4);
|
|
}
|
|
|
|
void GPUDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr)
|
|
{
|
|
for(int i = 0; i < 8; i++)
|
|
{
|
|
pextrw(eax, addr, (uint8)i);
|
|
|
|
if(m_sel.tlu) movzx(eax, byte[esi + eax]);
|
|
|
|
const Address& src = m_sel.tlu ? ptr[edx + eax * 2] : ptr[esi + eax * 2];
|
|
|
|
if(i == 0) movd(dst, src);
|
|
else pinsrw(dst, src, (uint8)i);
|
|
}
|
|
}
|
|
|
|
template<int shift>
|
|
void GPUDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f)
|
|
{
|
|
if(shift == 0 && m_cpu.has(util::Cpu::tSSSE3))
|
|
{
|
|
pmulhrsw(a, f);
|
|
}
|
|
else
|
|
{
|
|
psllw(a, shift + 1);
|
|
pmulhw(a, f);
|
|
}
|
|
}
|
|
|
|
template<int shift>
|
|
void GPUDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Operand& f)
|
|
{
|
|
psubw(a, b);
|
|
modulate16<shift>(a, f);
|
|
paddw(a, b);
|
|
}
|
|
|
|
void GPUDrawScanlineCodeGenerator::alltrue()
|
|
{
|
|
pmovmskb(eax, xmm7);
|
|
cmp(eax, 0xffff);
|
|
je("step", T_NEAR);
|
|
}
|
|
|
|
void GPUDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
|
|
{
|
|
if(m_cpu.has(util::Cpu::tSSE41))
|
|
{
|
|
pblendvb(a, b);
|
|
}
|
|
else
|
|
{
|
|
blend(a, b, xmm0);
|
|
}
|
|
}
|
|
|
|
void GPUDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask)
|
|
{
|
|
pand(b, mask);
|
|
pandn(mask, a);
|
|
por(b, mask);
|
|
movdqa(a, b);
|
|
}
|
|
|
|
// Lane masks for the last, partial group of pixels. Generate() loads
// m_test[7 + (steps & (steps >> 31))], so entry 7 is used for a full group and
// entries 0-6 when fewer than 8 pixels remain. A 0xffff lane ends up keeping
// its old framebuffer value in WriteFrame().
// NOTE(review): assuming the GSVector4i constructor takes its dwords lowest
// first, entry i leaves the first i + 1 pixels enabled - confirm.
const GSVector4i GPUDrawScanlineCodeGenerator::m_test[8] =
{
	GSVector4i(0xffff0000, 0xffffffff, 0xffffffff, 0xffffffff), // [0]
	GSVector4i(0x00000000, 0xffffffff, 0xffffffff, 0xffffffff), // [1]
	GSVector4i(0x00000000, 0xffff0000, 0xffffffff, 0xffffffff), // [2]
	GSVector4i(0x00000000, 0x00000000, 0xffffffff, 0xffffffff), // [3]
	GSVector4i(0x00000000, 0x00000000, 0xffff0000, 0xffffffff), // [4]
	GSVector4i(0x00000000, 0x00000000, 0x00000000, 0xffffffff), // [5]
	GSVector4i(0x00000000, 0x00000000, 0x00000000, 0xffff0000), // [6]
	GSVector4i::zero(),                                         // [7] nothing masked
};
|
|
|
|
// Dither offsets, one row per (top & 3). Init() loads 8 consecutive entries
// starting at column (left & 3), which is why every row repeats its 4-entry
// pattern: any start column from 0 to 3 still yields 8 valid values.
__aligned(const uint16, 32) GPUDrawScanlineCodeGenerator::m_dither[4][16] =
{
	{7, 0, 6, 1,  7, 0, 6, 1,  7, 0, 6, 1,  7, 0, 6, 1},
	{2, 5, 3, 4,  2, 5, 3, 4,  2, 5, 3, 4,  2, 5, 3, 4},
	{1, 6, 0, 7,  1, 6, 0, 7,  1, 6, 0, 7,  1, 6, 0, 7},
	{4, 3, 5, 2,  4, 3, 5, 2,  4, 3, 5, 2,  4, 3, 5, 2},
};
|