mirror of
https://github.com/libretro/pcsx2.git
synced 2024-11-24 17:59:47 +00:00
GSdx: The sw renderer now uses avx2, not much faster though, +10% maybe, if the game is not EE limited. I'm not sure if haswell has that much better sse execution (load/store units doubled for example), or the avx2 code is not fully optimized yet.
git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5677 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
3b753bec42
commit
d20bd4f86a
@ -1789,23 +1789,39 @@ public:
|
||||
GSVector8i TA0(TEXA.TA0 << 24);
|
||||
GSVector8i mask = GSVector8i::x00ffffff();
|
||||
|
||||
for(int i = 0; i < 4; i++, dst += dstpitch * 2)
|
||||
{
|
||||
GSVector8i v0 = s[i * 2 + 0];
|
||||
GSVector8i v1 = s[i * 2 + 1];
|
||||
GSVector8i v0, v1, v2, v3;
|
||||
|
||||
GSVector8i::sw128(v0, v1);
|
||||
GSVector8i::sw64(v0, v1);
|
||||
v0 = s[0] & mask;
|
||||
v1 = s[1] & mask;
|
||||
v2 = s[2] & mask;
|
||||
v3 = s[3] & mask;
|
||||
|
||||
v0 &= mask;
|
||||
v1 &= mask;
|
||||
GSVector8i::sw128(v0, v1);
|
||||
GSVector8i::sw64(v0, v1);
|
||||
GSVector8i::sw128(v2, v3);
|
||||
GSVector8i::sw64(v2, v3);
|
||||
|
||||
GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0];
|
||||
GSVector8i* d1 = (GSVector8i*)&dst[dstpitch * 1];
|
||||
*(GSVector8i*)&dst[dstpitch * 0] = Expand24to32<AEM>(v0, TA0);
|
||||
*(GSVector8i*)&dst[dstpitch * 1] = Expand24to32<AEM>(v1, TA0);
|
||||
*(GSVector8i*)&dst[dstpitch * 2] = Expand24to32<AEM>(v2, TA0);
|
||||
*(GSVector8i*)&dst[dstpitch * 3] = Expand24to32<AEM>(v3, TA0);
|
||||
|
||||
d0[0] = Expand24to32<AEM>(v0, TA0);
|
||||
d1[0] = Expand24to32<AEM>(v1, TA0);
|
||||
}
|
||||
v0 = s[4] & mask;
|
||||
v1 = s[5] & mask;
|
||||
v2 = s[6] & mask;
|
||||
v3 = s[7] & mask;
|
||||
|
||||
GSVector8i::sw128(v0, v1);
|
||||
GSVector8i::sw64(v0, v1);
|
||||
GSVector8i::sw128(v2, v3);
|
||||
GSVector8i::sw64(v2, v3);
|
||||
|
||||
dst += dstpitch * 4;
|
||||
|
||||
*(GSVector8i*)&dst[dstpitch * 0] = Expand24to32<AEM>(v0, TA0);
|
||||
*(GSVector8i*)&dst[dstpitch * 1] = Expand24to32<AEM>(v1, TA0);
|
||||
*(GSVector8i*)&dst[dstpitch * 2] = Expand24to32<AEM>(v2, TA0);
|
||||
*(GSVector8i*)&dst[dstpitch * 3] = Expand24to32<AEM>(v3, TA0);
|
||||
|
||||
#else
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -81,8 +81,8 @@ public:
|
||||
bool IsEdge() const {return m_global.sel.aa1;}
|
||||
bool IsRect() const {return m_global.sel.IsSolidRect();}
|
||||
|
||||
bool TestAlpha(GSVector4i& test, GSVector4i& fm, GSVector4i& zm, const GSVector4i& ga);
|
||||
void WritePixel(const GSVector4i& src, int addr, int i, uint32 psm);
|
||||
template<class T> bool TestAlpha(T& test, T& fm, T& zm, const T& ga);
|
||||
template<class T> void WritePixel(const T& src, int addr, int i, uint32 psm);
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -22,6 +22,38 @@
|
||||
#include "stdafx.h"
|
||||
#include "GSDrawScanlineCodeGenerator.h"
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
const GSVector8i GSDrawScanlineCodeGenerator::m_test[16] =
|
||||
{
|
||||
GSVector8i::zero(),
|
||||
GSVector8i(0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
|
||||
GSVector8i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
|
||||
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
|
||||
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
|
||||
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000),
|
||||
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000),
|
||||
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000),
|
||||
GSVector8i(0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
|
||||
GSVector8i(0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
|
||||
GSVector8i(0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
|
||||
GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
|
||||
GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff),
|
||||
GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff),
|
||||
GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff),
|
||||
GSVector8i::zero(),
|
||||
};
|
||||
|
||||
const GSVector8 GSDrawScanlineCodeGenerator::m_log2_coef[4] =
|
||||
{
|
||||
GSVector8(0.204446009836232697516f),
|
||||
GSVector8(-1.04913055217340124191f),
|
||||
GSVector8(2.28330284476918490682f),
|
||||
GSVector8(1.0f),
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
const GSVector4i GSDrawScanlineCodeGenerator::m_test[8] =
|
||||
{
|
||||
GSVector4i::zero(),
|
||||
@ -42,6 +74,8 @@ const GSVector4 GSDrawScanlineCodeGenerator::m_log2_coef[4] =
|
||||
GSVector4(1.0f),
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
, m_local(*(GSScanlineLocalData*)param)
|
||||
@ -51,6 +85,81 @@ GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key
|
||||
Generate();
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
void GSDrawScanlineCodeGenerator::modulate16(const Ymm& a, const Operand& f, int shift)
|
||||
{
|
||||
if(shift == 0)
|
||||
{
|
||||
vpmulhrsw(a, f);
|
||||
}
|
||||
else
|
||||
{
|
||||
vpsllw(a, (uint8)(shift + 1));
|
||||
vpmulhw(a, f);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::lerp16(const Ymm& a, const Ymm& b, const Ymm& f, int shift)
|
||||
{
|
||||
vpsubw(a, b);
|
||||
modulate16(a, f, shift);
|
||||
vpaddw(a, b);
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::lerp16_4(const Ymm& a, const Ymm& b, const Ymm& f)
|
||||
{
|
||||
vpsubw(a, b);
|
||||
vpmullw(a, f);
|
||||
vpsraw(a, 4);
|
||||
vpaddw(a, b);
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::mix16(const Ymm& a, const Ymm& b, const Ymm& temp)
|
||||
{
|
||||
vpblendw(a, b, 0xaa);
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::clamp16(const Ymm& a, const Ymm& temp)
|
||||
{
|
||||
vpackuswb(a, a);
|
||||
vpermq(a, a, _MM_SHUFFLE(3, 1, 2, 0)); // this sucks
|
||||
vpmovzxbw(a, a);
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::alltrue()
|
||||
{
|
||||
vpmovmskb(eax, ymm7);
|
||||
cmp(eax, 0xffffffff);
|
||||
je("step", T_NEAR);
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::blend(const Ymm& a, const Ymm& b, const Ymm& mask)
|
||||
{
|
||||
vpand(b, mask);
|
||||
vpandn(mask, a);
|
||||
vpor(a, b, mask);
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::blendr(const Ymm& b, const Ymm& a, const Ymm& mask)
|
||||
{
|
||||
vpand(b, mask);
|
||||
vpandn(mask, a);
|
||||
vpor(b, mask);
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::blend8(const Ymm& a, const Ymm& b)
|
||||
{
|
||||
vpblendvb(a, a, b, xmm0);
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::blend8r(const Ymm& b, const Ymm& a)
|
||||
{
|
||||
vpblendvb(b, a, b, xmm0);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, int shift)
|
||||
{
|
||||
#if _M_SSE >= 0x500
|
||||
@ -244,3 +353,5 @@ void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
@ -35,6 +35,55 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
|
||||
|
||||
void Generate();
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
void Init();
|
||||
void Step();
|
||||
void TestZ(const Ymm& temp1, const Ymm& temp2);
|
||||
void SampleTexture();
|
||||
void Wrap(const Ymm& uv0);
|
||||
void Wrap(const Ymm& uv0, const Ymm& uv1);
|
||||
void SampleTextureLOD();
|
||||
void WrapLOD(const Ymm& uv0);
|
||||
void WrapLOD(const Ymm& uv0, const Ymm& uv1);
|
||||
void AlphaTFX();
|
||||
void ReadMask();
|
||||
void TestAlpha();
|
||||
void ColorTFX();
|
||||
void Fog();
|
||||
void ReadFrame();
|
||||
void TestDestAlpha();
|
||||
void WriteMask();
|
||||
void WriteZBuf();
|
||||
void AlphaBlend();
|
||||
void WriteFrame();
|
||||
|
||||
#if defined(_M_AMD64) || defined(_WIN64)
|
||||
void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg64& addr);
|
||||
void WritePixel(const Ymm& src, const Ymm& temp, const Reg64& addr, const Reg32& mask, bool fast, int psm, int fz);
|
||||
void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, uint8 j, int psm);
|
||||
#else
|
||||
void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg32& addr);
|
||||
void WritePixel(const Ymm& src, const Ymm& temp, const Reg32& addr, const Reg32& mask, bool fast, int psm, int fz);
|
||||
void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, uint8 j, int psm);
|
||||
#endif
|
||||
|
||||
void ReadTexel(int pixels, int mip_offset = 0);
|
||||
void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i);
|
||||
|
||||
void modulate16(const Ymm& a, const Operand& f, int shift);
|
||||
void lerp16(const Ymm& a, const Ymm& b, const Ymm& f, int shift);
|
||||
void lerp16_4(const Ymm& a, const Ymm& b, const Ymm& f);
|
||||
void mix16(const Ymm& a, const Ymm& b, const Ymm& temp);
|
||||
void clamp16(const Ymm& a, const Ymm& temp);
|
||||
void alltrue();
|
||||
void blend(const Ymm& a, const Ymm& b, const Ymm& mask);
|
||||
void blendr(const Ymm& b, const Ymm& a, const Ymm& mask);
|
||||
void blend8(const Ymm& a, const Ymm& b);
|
||||
void blend8r(const Ymm& b, const Ymm& a);
|
||||
|
||||
#else
|
||||
|
||||
void Init();
|
||||
void Step();
|
||||
void TestZ(const Xmm& temp1, const Xmm& temp2);
|
||||
@ -80,9 +129,17 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
|
||||
void blend8(const Xmm& a, const Xmm& b);
|
||||
void blend8r(const Xmm& b, const Xmm& a);
|
||||
|
||||
#endif
|
||||
|
||||
public:
|
||||
GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
static const GSVector8i m_test[16];
|
||||
static const GSVector8 m_log2_coef[4];
|
||||
#else
|
||||
static const GSVector4i m_test[8];
|
||||
static const GSVector4 m_log2_coef[4];
|
||||
#endif
|
||||
|
||||
};
|
||||
|
@ -23,7 +23,7 @@
|
||||
#include "GSDrawScanlineCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
static const int _args = 16;
|
||||
static const int _top = _args + 4;
|
||||
@ -1236,24 +1236,21 @@ return;
|
||||
|
||||
// m_local.gd->t.minmax => m_local.temp.uv_minmax[0/1]
|
||||
|
||||
vmovq(xmm4, ptr[&m_local.gd->t.minmax]); // x x x x maxv maxu minv minu
|
||||
vpunpcklwd(xmm4, xmm4); // maxv maxv maxu maxu minv minv minu minu
|
||||
|
||||
vpxor(xmm1, xmm1);
|
||||
|
||||
vpunpckldq(xmm6, xmm4, xmm4); // minv minv minv minv minu minu minu minu
|
||||
vpunpcklwd(xmm5, xmm6, xmm1); // 0 minu 0 minu 0 minu 0 minu
|
||||
|
||||
vmovdqa(xmm4, ptr[&m_local.gd->t.min]);
|
||||
vpunpcklwd(xmm5, xmm4, xmm1); // minu
|
||||
vpunpckhwd(xmm6, xmm4, xmm1); // minv
|
||||
vpsrlvd(xmm5, xmm5, xmm0);
|
||||
vpunpckhwd(xmm6, xmm6, xmm1); // 0 minv 0 minv 0 minv 0 minv
|
||||
vpsrlvd(xmm6, xmm6, xmm0);
|
||||
vpackusdw(xmm5, xmm6); // xmm5 = minv minv minv minv minu minu minu minu
|
||||
|
||||
vpunpckhdq(xmm4, xmm4); // maxv maxv maxv maxv maxu maxu maxu maxu
|
||||
vpunpcklwd(xmm6, xmm4, xmm1); // 0 maxu 0 maxu 0 maxu 0 maxu
|
||||
vpackusdw(xmm5, xmm6);
|
||||
|
||||
vmovdqa(xmm4, ptr[&m_local.gd->t.max]);
|
||||
vpunpcklwd(xmm6, xmm4, xmm1); // maxu
|
||||
vpunpckhwd(xmm4, xmm4, xmm1); // maxv
|
||||
vpsrlvd(xmm6, xmm6, xmm0);
|
||||
vpunpckhwd(xmm4, xmm1); // 0 maxv 0 maxv 0 maxv 0 maxv
|
||||
vpsrlvd(xmm4, xmm4, xmm0);
|
||||
vpackusdw(xmm6, xmm4); // xmm6 = maxv maxv maxv maxv maxu maxu maxu maxu
|
||||
vpackusdw(xmm6, xmm4);
|
||||
|
||||
vmovdqa(ptr[&m_local.temp.uv_minmax[0]], xmm5);
|
||||
vmovdqa(ptr[&m_local.temp.uv_minmax[1]], xmm6);
|
||||
@ -2807,7 +2804,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
|
||||
}
|
||||
}
|
||||
|
||||
static const int s_offsets[4] = {0, 2, 8, 10};
|
||||
static const int s_offsets[] = {0, 2, 8, 10};
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm)
|
||||
{
|
||||
@ -2865,7 +2862,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
||||
vmovdqa(ptr[&m_local.temp.test], xmm7);
|
||||
}
|
||||
|
||||
for(int j = 0; j < 4; j++)
|
||||
for(uint8 j = 0; j < 4; j++)
|
||||
{
|
||||
mov(ebx, ptr[&lod_i->u32[j]]);
|
||||
mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
|
||||
@ -2895,18 +2892,9 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
||||
|
||||
for(int i = 0; i < pixels; i++)
|
||||
{
|
||||
if(m_cpu.has(util::Cpu::tAVX2) && !m_sel.tlu) // vpgatherdd seems to be dead slow for byte aligned offsets, not using it for palette lookups
|
||||
for(uint8 j = 0; j < 4; j++)
|
||||
{
|
||||
Xmm mask = Xmm(t[i]);
|
||||
vpcmpeqd(mask, mask);
|
||||
vpgatherdd(Xmm(r[i * 2 + 1]), ptr[ebx + Xmm(r[i * 2 + 0]) * 4], mask);
|
||||
}
|
||||
else
|
||||
{
|
||||
for(int j = 0; j < 4; j++)
|
||||
{
|
||||
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
|
||||
}
|
||||
ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2914,6 +2902,8 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
|
||||
{
|
||||
ASSERT(i < 4);
|
||||
|
||||
const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4];
|
||||
|
||||
if(i == 0) vmovd(eax, addr);
|
||||
|
2968
plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp
Normal file
2968
plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -209,7 +209,7 @@ public:
|
||||
|
||||
iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &ml);
|
||||
/*
|
||||
name = format("c:/temp/%s_%016llx.bin", m_name.c_str(), (uint64)key);
|
||||
name = format("c:/temp1/%s_%016llx.bin", m_name.c_str(), (uint64)key);
|
||||
|
||||
if(FILE* fp = fopen(name.c_str(), "wb"))
|
||||
{
|
||||
@ -218,7 +218,7 @@ public:
|
||||
fputc(0x64, fp); fputc(0x67, fp); fputc(0x90, fp);
|
||||
|
||||
fwrite(cg->getCode(), cg->getSize(), 1, fp);
|
||||
|
||||
|
||||
fputc(0xBB, fp); fputc(0xDE, fp); fputc(0x00, fp); fputc(0x00, fp); fputc(0x00, fp);
|
||||
fputc(0x64, fp); fputc(0x67, fp); fputc(0x90, fp);
|
||||
fputc(0x0F, fp); fputc(0x0B, fp);
|
||||
|
@ -208,6 +208,10 @@ void GSRasterizer::Draw(GSRasterizerData* data)
|
||||
__assume(0);
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
_mm256_zeroupper();
|
||||
#endif
|
||||
|
||||
data->pixels = m_pixels;
|
||||
|
||||
uint64 ticks = __rdtsc() - data->start;
|
||||
@ -917,7 +921,7 @@ GSRasterizerList::GSRasterizerList(int threads, GSPerfMon* perfmon)
|
||||
{
|
||||
for(int i = 0; i < threads; i++, row++)
|
||||
{
|
||||
m_scanline[row] = i;
|
||||
m_scanline[row] = (uint8)i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -400,7 +400,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*
|
||||
|
||||
dst->t = t;
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
#if 0 //_M_SSE >= 0x501
|
||||
|
||||
dst->_pad = GSVector4::zero();
|
||||
|
||||
@ -1342,8 +1342,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
|
||||
{
|
||||
gd.sel.fge = 1;
|
||||
|
||||
gd.frb = GSVector4i((int)env.FOGCOL.u32[0] & 0x00ff00ff);
|
||||
gd.fga = GSVector4i((int)(env.FOGCOL.u32[0] >> 8) & 0x00ff00ff);
|
||||
gd.frb = env.FOGCOL.u32[0] & 0x00ff00ff;
|
||||
gd.fga = (env.FOGCOL.u32[0] >> 8) & 0x00ff00ff;
|
||||
}
|
||||
|
||||
if(context->FRAME.PSM != PSM_PSMCT24)
|
||||
@ -1403,6 +1403,34 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
|
||||
gd.sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000;
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
gd.fm = fm;
|
||||
gd.zm = zm;
|
||||
|
||||
if(gd.sel.fpsm == 1)
|
||||
{
|
||||
gd.fm |= 0xff000000;
|
||||
}
|
||||
else if(gd.sel.fpsm == 2)
|
||||
{
|
||||
uint32 rb = gd.fm & 0x00f800f8;
|
||||
uint32 ga = gd.fm & 0x8000f800;
|
||||
|
||||
gd.fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000;
|
||||
}
|
||||
|
||||
if(gd.sel.zpsm == 1)
|
||||
{
|
||||
gd.zm |= 0xff000000;
|
||||
}
|
||||
else if(gd.sel.zpsm == 2)
|
||||
{
|
||||
gd.zm |= 0xffff0000;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
gd.fm = GSVector4i(fm);
|
||||
gd.zm = GSVector4i(zm);
|
||||
|
||||
@ -1427,6 +1455,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
|
||||
gd.zm |= GSVector4i::xffff0000();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
if(gd.sel.prim == GS_SPRITE_CLASS && !gd.sel.ftest && !gd.sel.ztest && data->bbox.eq(data->bbox.rintersect(data->scissor))) // TODO: check scissor horizontally only
|
||||
{
|
||||
gd.sel.notest = 1;
|
||||
@ -1435,7 +1465,11 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
|
||||
|
||||
for(int i = 0, j = m_vertex.tail; i < j; i++)
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
if((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 7) // aligned to 8
|
||||
#else
|
||||
if((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 3) // aligned to 4
|
||||
#endif
|
||||
{
|
||||
gd.sel.notest = 0;
|
||||
|
||||
|
@ -116,7 +116,7 @@ __aligned(struct, 32) GSScanlineGlobalData // per batch variables, this is like
|
||||
void* vm;
|
||||
const void* tex[7];
|
||||
uint32* clut;
|
||||
GSVector4i* dimx;
|
||||
GSVector4i* dimx;
|
||||
|
||||
const int* fbr;
|
||||
const int* zbr;
|
||||
@ -125,19 +125,63 @@ __aligned(struct, 32) GSScanlineGlobalData // per batch variables, this is like
|
||||
const GSVector2i* fzbr;
|
||||
const GSVector2i* fzbc;
|
||||
|
||||
GSVector4i fm, zm;
|
||||
struct {GSVector4i min, max, minmax, mask, invmask;} t; // [u] x 4 [v] x 4
|
||||
GSVector4i aref;
|
||||
GSVector4i afix;
|
||||
struct {GSVector4i min, max, minmax, mask, invmask;} t; // [u] x 4 [v] x 4
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
uint32 fm, zm;
|
||||
uint32 frb, fga;
|
||||
GSVector8 mxl;
|
||||
GSVector8 k; // TEX1.K * 0x10000
|
||||
GSVector8 l; // TEX1.L * -0x10000
|
||||
struct {GSVector8i i, f;} lod; // lcm == 1
|
||||
|
||||
#else
|
||||
|
||||
GSVector4i fm, zm;
|
||||
GSVector4i frb, fga;
|
||||
GSVector4 mxl;
|
||||
GSVector4 k; // TEX1.K * 0x10000
|
||||
GSVector4 l; // TEX1.L * -0x10000
|
||||
struct {GSVector4i i, f;} lod; // lcm == 1
|
||||
|
||||
#endif
|
||||
};
|
||||
|
||||
__aligned(struct, 32) GSScanlineLocalData // per prim variables, each thread has its own
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
struct skip {GSVector8 z, s, t, q; GSVector8i rb, ga, f, _pad;} d[8];
|
||||
struct step {GSVector8 z, stq; GSVector8i c, f;} d8;
|
||||
struct {GSVector8i rb, ga;} c;
|
||||
struct {uint32 z, f;} p;
|
||||
|
||||
// these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack)
|
||||
|
||||
struct
|
||||
{
|
||||
GSVector8 z, zo;
|
||||
GSVector8i f;
|
||||
GSVector8 s, t, q;
|
||||
GSVector8i rb, ga;
|
||||
GSVector8i zs, zd;
|
||||
GSVector8i uf, vf;
|
||||
GSVector8i cov;
|
||||
|
||||
// mipmapping
|
||||
|
||||
struct {GSVector8i i, f;} lod;
|
||||
GSVector8i uv[2];
|
||||
GSVector8i uv_minmax[2];
|
||||
GSVector8i trb, tga;
|
||||
GSVector8i test;
|
||||
} temp;
|
||||
|
||||
#else
|
||||
|
||||
struct skip {GSVector4 z, s, t, q; GSVector4i rb, ga, f, _pad;} d[4];
|
||||
struct step {GSVector4 z, stq; GSVector4i c, f;} d4;
|
||||
struct {GSVector4i rb, ga;} c;
|
||||
@ -164,6 +208,8 @@ __aligned(struct, 32) GSScanlineLocalData // per prim variables, each thread has
|
||||
GSVector4i test;
|
||||
} temp;
|
||||
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
const GSScanlineGlobalData* gd;
|
||||
|
@ -22,6 +22,23 @@
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
const GSVector8 GSSetupPrimCodeGenerator::m_shift[9] =
|
||||
{
|
||||
GSVector8(8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f),
|
||||
GSVector8(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f),
|
||||
GSVector8(-1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f),
|
||||
GSVector8(-2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f),
|
||||
GSVector8(-3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f),
|
||||
GSVector8(-4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f),
|
||||
GSVector8(-5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f),
|
||||
GSVector8(-6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f),
|
||||
GSVector8(-7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f),
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
const GSVector4 GSSetupPrimCodeGenerator::m_shift[5] =
|
||||
{
|
||||
GSVector4(4.0f, 4.0f, 4.0f, 4.0f),
|
||||
@ -31,6 +48,8 @@ const GSVector4 GSSetupPrimCodeGenerator::m_shift[5] =
|
||||
GSVector4(-3.0f, -2.0f, -1.0f, 0.0f),
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
, m_local(*(GSScanlineLocalData*)param)
|
||||
|
@ -42,5 +42,9 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator
|
||||
public:
|
||||
GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
static const GSVector8 m_shift[9];
|
||||
#else
|
||||
static const GSVector4 m_shift[5];
|
||||
#endif
|
||||
};
|
||||
|
@ -23,7 +23,7 @@
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
|
353
plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx2.cpp
Normal file
353
plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx2.cpp
Normal file
@ -0,0 +1,353 @@
|
||||
/*
|
||||
* Copyright (C) 2007-2009 Gabest
|
||||
* http://www.gabest.org
|
||||
*
|
||||
* This Program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This Program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNU Make; see the file COPYING. If not, write to
|
||||
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
*/
|
||||
|
||||
#include "stdafx.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
#if _M_SSE >= 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
static const int _args = 0;
|
||||
static const int _vertex = _args + 4;
|
||||
static const int _index = _args + 8;
|
||||
static const int _dscan = _args + 12;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate()
|
||||
{
|
||||
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(edx, dword[esp + _dscan]);
|
||||
|
||||
for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
|
||||
{
|
||||
vmovaps(Ymm(3 + i), ptr[&m_shift[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth()
|
||||
{
|
||||
if(!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// GSVector8 df = GSVector8::broadcast32(dscan.p.wwww());
|
||||
|
||||
vbroadcastss(ymm1, ptr[edx + offsetof(GSVertexSW, p.w)]);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// GSVector8 dz = GSVector8::broadcast32(dscan.p.zzzz());
|
||||
|
||||
vbroadcastss(ymm2, ptr[edx + offsetof(GSVertexSW, p.z)]);
|
||||
}
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.d8.f = GSVector8i(df * shift[0]).xxzzlh();
|
||||
|
||||
vmulps(ymm0, ymm1, ymm3);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d8.f], ymm0);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// m_local.d8.z = dz * shift[0];
|
||||
|
||||
vmulps(ymm0, ymm2, ymm3);
|
||||
vmovaps(ptr[&m_local.d8.z], ymm0);
|
||||
}
|
||||
|
||||
for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
if(i < 4) vmulps(ymm0, ymm1, Ymm(4 + i));
|
||||
else vmulps(ymm0, ymm1, ptr[&m_shift[i + 1]]);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d[i].f], ymm0);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// m_local.d[i].z = dz * shift[1 + i];
|
||||
|
||||
if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
|
||||
else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
|
||||
vmovaps(ptr[&m_local.d[i].z], ymm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
mov(ecx, ptr[esp + _index]);
|
||||
mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
|
||||
shl(ecx, 6); // * sizeof(GSVertexSW)
|
||||
add(ecx, ptr[esp + _vertex]);
|
||||
|
||||
if(m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();
|
||||
|
||||
vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpextrd(ptr[&m_local.p.f], xmm0, 3);
|
||||
}
|
||||
|
||||
if(m_en.z)
|
||||
{
|
||||
// m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
|
||||
|
||||
mov(eax, ptr[ecx + offsetof(GSVertexSW, t.w)]);
|
||||
mov(ptr[&m_local.p.z], eax);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture()
|
||||
{
|
||||
if(!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector8 dt(dscan.t);
|
||||
|
||||
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, t)]);
|
||||
|
||||
// GSVector8 dt8 = dt * shift[0];
|
||||
|
||||
vmulps(ymm1, ymm0, ymm3);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d8.stq = GSVector8::cast(GSVector8i(dt8));
|
||||
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
|
||||
vmovdqa(ptr[&m_local.d8.stq], ymm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d8.stq = dt8;
|
||||
|
||||
vmovaps(ptr[&m_local.d8.stq], ymm1);
|
||||
}
|
||||
|
||||
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector8 dstq = dt.xxxx/yyyy/zzzz();
|
||||
|
||||
vshufps(ymm1, ymm0, ymm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8 v = dstq * shift[1 + i];
|
||||
|
||||
if(i < 4) vmulps(ymm2, ymm1, Ymm(4 + i));
|
||||
else vmulps(ymm2, ymm1, ptr[&m_shift[i + 1]]);
|
||||
|
||||
if(m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector8::cast(GSVector8i(v));
|
||||
|
||||
vcvttps2dq(ymm2, ymm2);
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovdqa(ptr[&m_local.d[i].s], ymm2); break;
|
||||
case 1: vmovdqa(ptr[&m_local.d[i].t], ymm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch(j)
|
||||
{
|
||||
case 0: vmovaps(ptr[&m_local.d[i].s], ymm2); break;
|
||||
case 1: vmovaps(ptr[&m_local.d[i].t], ymm2); break;
|
||||
case 2: vmovaps(ptr[&m_local.d[i].q], ymm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color()
|
||||
{
|
||||
if(!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if(m_sel.iip)
|
||||
{
|
||||
// GSVector8 dc(dscan.c);
|
||||
|
||||
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// m_local.d8.c = GSVector8i(dc * shift[0]).xzyw().ps32();
|
||||
|
||||
vmulps(ymm1, ymm0, ymm3);
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(ymm1, ymm1);
|
||||
vmovdqa(ptr[&m_local.d8.c], ymm1);
|
||||
|
||||
// ymm3 is not needed anymore
|
||||
|
||||
// GSVector8 dr = dc.xxxx();
|
||||
// GSVector8 db = dc.zzzz();
|
||||
|
||||
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32();
|
||||
|
||||
if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
|
||||
else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
vpackssdw(ymm0, ymm0);
|
||||
|
||||
// GSVector4i b = GSVector8i(db * shift[1 + i]).ps32();
|
||||
|
||||
if(i < 4) vmulps(ymm1, ymm3, Ymm(4 + i));
|
||||
else vmulps(ymm1, ymm3, ptr[&m_shift[i + 1]]);
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpackssdw(ymm1, ymm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
vmovdqa(ptr[&m_local.d[i].rb], ymm0);
|
||||
}
|
||||
|
||||
// GSVector8 dc(dscan.c);
|
||||
|
||||
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector8 dg = dc.yyyy();
|
||||
// GSVector8 da = dc.wwww();
|
||||
|
||||
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32();
|
||||
|
||||
if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
|
||||
else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
vpackssdw(ymm0, ymm0);
|
||||
|
||||
// GSVector8i a = GSVector8i(da * shift[1 + i]).ps32();
|
||||
|
||||
if(i < 4) vmulps(ymm1, ymm3, Ymm(4 + i));
|
||||
else vmulps(ymm1, ymm3, ptr[&m_shift[i + 1]]);
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpackssdw(ymm1, ymm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
vmovdqa(ptr[&m_local.d[i].ga], ymm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector8i c = GSVector8i(GSVector8(vertex[index[last]].c));
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch(m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
mov(ecx, ptr[esp + _index]);
|
||||
mov(ecx, ptr[ecx + sizeof(uint32) * last]);
|
||||
shl(ecx, 6); // * sizeof(GSVertexSW)
|
||||
add(ecx, ptr[esp + _vertex]);
|
||||
}
|
||||
|
||||
vbroadcasti128(ymm0, ptr[ecx + offsetof(GSVertexSW, c)]);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(ymm1, ymm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if(m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(ymm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(ymm1, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(ymm2, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(ptr[&m_local.c.rb], ymm1);
|
||||
vmovdqa(ptr[&m_local.c.ga], ymm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
@ -75,6 +75,7 @@ const GSVector4 GSVector4::m_x4f800000(_mm_castsi128_ps(_mm_set1_epi32(0x4f80000
|
||||
|
||||
#if _M_SSE >= 0x500
|
||||
|
||||
const GSVector8 GSVector8::m_half(0.5f);
|
||||
const GSVector8 GSVector8::m_one(1.0f);
|
||||
const GSVector8 GSVector8::m_x7fffffff(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
|
||||
const GSVector8 GSVector8::m_x80000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
|
||||
|
@ -4119,6 +4119,15 @@ public:
|
||||
|
||||
// TODO: extract/insert
|
||||
|
||||
template<int i> __forceinline int extract32() const
|
||||
{
|
||||
GSVector4i v = extract<i / 4>();
|
||||
|
||||
if((i & 3) == 0) return GSVector4i::store(v);
|
||||
|
||||
return v.extract32<i>();
|
||||
}
|
||||
|
||||
template<int i> __forceinline GSVector4i extract() const
|
||||
{
|
||||
if(i == 0) return GSVector4i(_mm256_castsi256_si128(m));
|
||||
@ -4141,7 +4150,6 @@ public:
|
||||
GSVector4i a0 = extract<0>();
|
||||
GSVector4i a1 = extract<1>();
|
||||
|
||||
|
||||
v0 = GSVector4i::load((int)ptr[a0.extract32<0>()]);
|
||||
v0 = v0.insert32<1>((int)ptr[a0.extract32<1>()]);
|
||||
v0 = v0.insert32<2>((int)ptr[a0.extract32<2>()]);
|
||||
@ -4191,14 +4199,14 @@ public:
|
||||
return cast(v0).insert<1>(v1);
|
||||
}
|
||||
|
||||
template<> __forceinline GSVector8i gather32_32<uint32, uint8>(const uint32* ptr1, const uint8* ptr2) const
|
||||
template<> __forceinline GSVector8i gather32_32<uint8, uint32>(const uint8* ptr1, const uint32* ptr2) const
|
||||
{
|
||||
return gather32_32<uint8>(ptr2).gather32_32<uint32>(ptr1);
|
||||
return gather32_32<uint8>(ptr1).gather32_32<uint32>(ptr2);
|
||||
}
|
||||
|
||||
template<> __forceinline GSVector8i gather32_32<uint32, uint32>(const uint32* ptr1, const uint32* ptr2) const
|
||||
{
|
||||
return gather32_32<uint32>(ptr2).gather32_32<uint32>(ptr1);
|
||||
return gather32_32<uint32>(ptr1).gather32_32<uint32>(ptr2);
|
||||
}
|
||||
|
||||
template<class T> __forceinline void gather32_32(const T* RESTRICT ptr, GSVector8i* RESTRICT dst) const
|
||||
@ -4731,6 +4739,16 @@ public:
|
||||
return GSVector8i(_mm256_broadcastq_epi64(v.m));
|
||||
}
|
||||
|
||||
__forceinline static GSVector8i broadcast128(const GSVector4i& v)
|
||||
{
|
||||
// this one only has m128 source op, it will be saved to a temp on stack if the compiler is not smart enough and use the address of v directly (<= vs2012u3rc2)
|
||||
|
||||
return GSVector8i(_mm256_broadcastsi128_si256(v)); // fastest
|
||||
//return GSVector8i(v); // almost as fast as broadcast
|
||||
//return cast(v).insert<1>(v); // slow
|
||||
//return cast(v).aa(); // slowest
|
||||
}
|
||||
|
||||
__forceinline static GSVector8i zero() {return GSVector8i(_mm256_setzero_si256());}
|
||||
|
||||
__forceinline static GSVector8i xffffffff() {return zero() == zero();}
|
||||
@ -4958,6 +4976,7 @@ public:
|
||||
__m128 m0, m1;
|
||||
};
|
||||
|
||||
static const GSVector8 m_half;
|
||||
static const GSVector8 m_one;
|
||||
static const GSVector8 m_x7fffffff;
|
||||
static const GSVector8 m_x80000000;
|
||||
|
@ -646,6 +646,18 @@
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx2.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
|
||||
@ -737,6 +749,7 @@
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp" />
|
||||
<ClCompile Include="GSSetupPrimCodeGenerator.x86.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">true</ExcludedFromBuild>
|
||||
@ -2054,4 +2067,4 @@
|
||||
<UserProperties RESOURCE_FILE="GSdx.rc" />
|
||||
</VisualStudio>
|
||||
</ProjectExtensions>
|
||||
</Project>
|
||||
</Project>
|
@ -336,6 +336,12 @@
|
||||
<ClCompile Include="GSRendererCS.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx2.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="GLLoader.h">
|
||||
@ -728,4 +734,4 @@
|
||||
<Filter>Resource Files</Filter>
|
||||
</ResourceCompile>
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
</Project>
|
@ -789,22 +789,22 @@ void vpsignw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(x
|
||||
void vpsignw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x09, true, -1); }
|
||||
void vpsignd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0A, true, -1); }
|
||||
void vpsignd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x0A, true, -1); }
|
||||
void vpsllw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF1, false, -1); }
|
||||
void vpsllw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF1, false, -1); }
|
||||
void vpslld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF2, false, -1); }
|
||||
void vpslld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF2, false, -1); }
|
||||
void vpsllq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF3, false, -1); }
|
||||
void vpsllq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF3, false, -1); }
|
||||
void vpsraw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE1, false, -1); }
|
||||
void vpsraw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE1, false, -1); }
|
||||
void vpsrad(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE2, false, -1); }
|
||||
void vpsrad(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE2, false, -1); }
|
||||
void vpsrlw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD1, false, -1); }
|
||||
void vpsrlw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD1, false, -1); }
|
||||
void vpsrld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD2, false, -1); }
|
||||
void vpsrld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD2, false, -1); }
|
||||
void vpsrlq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD3, false, -1); }
|
||||
void vpsrlq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD3, false, -1); }
|
||||
void vpsllw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF1, true, -1); }
|
||||
void vpsllw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF1, true, -1); }
|
||||
void vpslld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF2, true, -1); }
|
||||
void vpslld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF2, true, -1); }
|
||||
void vpsllq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF3, true, -1); }
|
||||
void vpsllq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF3, true, -1); }
|
||||
void vpsraw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE1, true, -1); }
|
||||
void vpsraw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE1, true, -1); }
|
||||
void vpsrad(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE2, true, -1); }
|
||||
void vpsrad(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE2, true, -1); }
|
||||
void vpsrlw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD1, true, -1); }
|
||||
void vpsrlw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD1, true, -1); }
|
||||
void vpsrld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD2, true, -1); }
|
||||
void vpsrld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD2, true, -1); }
|
||||
void vpsrlq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD3, true, -1); }
|
||||
void vpsrlq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD3, true, -1); }
|
||||
void vpsubb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF8, true, -1); }
|
||||
void vpsubb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF8, true, -1); }
|
||||
void vpsubw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF9, true, -1); }
|
||||
@ -1345,8 +1345,8 @@ void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) {
|
||||
void vblendvpd(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4B, true); db(x4.getIdx() << 4); }
|
||||
void vblendvps(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4A, true); db(x4.getIdx() << 4); }
|
||||
void vblendvps(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4A, true); db(x4.getIdx() << 4); }
|
||||
void vpblendvb(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4C, false); db(x4.getIdx() << 4); }
|
||||
void vpblendvb(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4C, false); db(x4.getIdx() << 4); }
|
||||
void vpblendvb(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4C, true); db(x4.getIdx() << 4); }
|
||||
void vpblendvb(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4C, true); db(x4.getIdx() << 4); }
|
||||
void vmovd(const Xmm& x, const Reg32& reg) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x6E, false, 0); }
|
||||
void vmovd(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x6E, false, 0); }
|
||||
void vmovd(const Reg32& reg, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x7E, false, 0); }
|
||||
@ -1410,3 +1410,13 @@ void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1
|
||||
void vpgatherqd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x91, 0, 2); }
|
||||
void vpgatherdq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x90, 1, 0); }
|
||||
void vpgatherqq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x91, 1, 1); }
|
||||
|
||||
// mods
|
||||
|
||||
void vpbroadcastb(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x78, true, 0); }
|
||||
void vpbroadcastw(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x79, true, 0); }
|
||||
void vpbroadcastd(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x58, true, 0); }
|
||||
void vpbroadcastq(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x59, true, 0); }
|
||||
|
||||
// supportYMM = true
|
||||
// vpblendvb, vpsllw-vpsrlq
|
||||
|
Loading…
Reference in New Issue
Block a user