This commit is contained in:
gabest
2009-01-04 20:17:05 +00:00
parent 6e880a6b07
commit f43f766ed7
17 changed files with 2318 additions and 2394 deletions

View File

@@ -35,7 +35,7 @@ GPUDrawScanline::~GPUDrawScanline()
// IDrawScanline
void GPUDrawScanline::BeginDraw(const GSRasterizerData* data, DrawScanlinePtr* dsf, DrawSolidRectPtr* dsrf)
void GPUDrawScanline::BeginDraw(const GSRasterizerData* data, Functions* f)
{
GPUDrawingEnvironment& env = m_state->m_env;
@@ -71,8 +71,8 @@ void GPUDrawScanline::BeginDraw(const GSRasterizerData* data, DrawScanlinePtr* d
m_env.a = GSVector4i(env.PRIM.ABE ? 0xffffffff : 0);
m_env.md = GSVector4i(env.STATUS.MD ? 0x80008000 : 0);
*dsf = m_ds[m_env.sel];
*dsrf = NULL; // TODO
f->sl = m_ds[m_env.sel];
f->sr = NULL; // TODO
}
void GPUDrawScanline::SetupPrim(GS_PRIM_CLASS primclass, const GSVertexSW* vertices, const GSVertexSW& dscan)
@@ -118,7 +118,7 @@ void GPUDrawScanline::SetupPrim(GS_PRIM_CLASS primclass, const GSVertexSW* verti
}
void GPUDrawScanline::SampleTexture(int pixels, DWORD ltf, DWORD tlu, DWORD twin, GSVector4i& test, const GSVector4i& s, const GSVector4i& t, GSVector4i* c)
void GPUDrawScanline::SampleTexture(DWORD ltf, DWORD tlu, DWORD twin, GSVector4i& test, const GSVector4i& s, const GSVector4i& t, GSVector4i* c)
{
const void* RESTRICT tex = m_env.tex;
const WORD* RESTRICT clut = m_env.clut;
@@ -159,8 +159,6 @@ void GPUDrawScanline::SampleTexture(int pixels, DWORD ltf, DWORD tlu, DWORD twin
GSVector4i c00, c01, c10, c11;
#if _M_SSE >= 0x401
if(tlu)
{
c00 = addr00.gather16_16((const BYTE*)tex, clut);
@@ -176,45 +174,6 @@ void GPUDrawScanline::SampleTexture(int pixels, DWORD ltf, DWORD tlu, DWORD twin
c11 = addr01.gather16_16((const WORD*)tex);
}
#else
int i = 0;
if(tlu)
{
do
{
if(test.u16[i]) // me &&
{
continue;
}
c00.u16[i] = clut[((const BYTE*)tex)[addr00.u16[i]]];
c01.u16[i] = clut[((const BYTE*)tex)[addr01.u16[i]]];
c10.u16[i] = clut[((const BYTE*)tex)[addr10.u16[i]]];
c11.u16[i] = clut[((const BYTE*)tex)[addr11.u16[i]]];
}
while(++i < pixels);
}
else
{
do
{
if(test.u16[i]) // me &&
{
continue;
}
c00.u16[i] = ((const WORD*)tex)[addr00.u16[i]];
c01.u16[i] = ((const WORD*)tex)[addr01.u16[i]];
c10.u16[i] = ((const WORD*)tex)[addr10.u16[i]];
c11.u16[i] = ((const WORD*)tex)[addr11.u16[i]];
}
while(++i < pixels);
}
#endif
GSVector4i r00 = (c00 & 0x001f001f) << 3;
GSVector4i r01 = (c01 & 0x001f001f) << 3;
GSVector4i r10 = (c10 & 0x001f001f) << 3;
@@ -279,8 +238,6 @@ void GPUDrawScanline::SampleTexture(int pixels, DWORD ltf, DWORD tlu, DWORD twin
GSVector4i c00;
#if _M_SSE >= 0x401
if(tlu)
{
c00 = addr.gather16_16((const BYTE*)tex, clut);
@@ -290,39 +247,6 @@ void GPUDrawScanline::SampleTexture(int pixels, DWORD ltf, DWORD tlu, DWORD twin
c00 = addr.gather16_16((const WORD*)tex);
}
#else
int i = 0;
if(tlu)
{
do
{
if(test.u16[i]) // me &&
{
continue;
}
c00.u16[i] = clut[((const BYTE*)tex)[addr.u16[i]]];
}
while(++i < pixels);
}
else
{
do
{
if(test.u16[i]) // me &&
{
continue;
}
c00.u16[i] = ((const WORD*)tex)[addr.u16[i]];
}
while(++i < pixels);
}
#endif
test |= c00.eq16(GSVector4i::zero()); // mask out blank pixels
c[0] = (c00 & 0x001f001f) << 3;
@@ -767,7 +691,7 @@ void GPUDrawScanline::DrawScanline(int top, int left, int right, const GSVertexS
if(m_env.sel.tme)
{
SampleTexture(pixels, m_env.sel.ltf, m_env.sel.tlu, m_env.sel.twin, test, s, t, c);
SampleTexture(m_env.sel.ltf, m_env.sel.tlu, m_env.sel.twin, test, s, t, c);
}
ColorTFX(m_env.sel.tfx, r, g, b, c);
@@ -890,7 +814,7 @@ void GPUDrawScanline::DrawScanlineEx(int top, int left, int right, const GSVerte
if(tme)
{
SampleTexture(pixels, m_env.sel.ltf, m_env.sel.tlu, twin, test, s, t, c);
SampleTexture(m_env.sel.ltf, m_env.sel.tlu, twin, test, s, t, c);
}
ColorTFX(tfx, r, g, b, c);

View File

@@ -94,7 +94,7 @@ class GPUDrawScanline : public GSAlignedClass<16>, public IDrawScanline
template<DWORD sel>
void DrawScanlineEx(int top, int left, int right, const GSVertexSW& v);
__forceinline void SampleTexture(int pixels, DWORD ltf, DWORD tlu, DWORD twin, GSVector4i& test, const GSVector4i& s, const GSVector4i& t, GSVector4i* c);
__forceinline void SampleTexture(DWORD ltf, DWORD tlu, DWORD twin, GSVector4i& test, const GSVector4i& s, const GSVector4i& t, GSVector4i* c);
__forceinline void ColorTFX(DWORD tfx, const GSVector4i& r, const GSVector4i& g, const GSVector4i& b, GSVector4i* c);
__forceinline void AlphaBlend(UINT32 abr, UINT32 tme, const GSVector4i& d, GSVector4i* c);
__forceinline void WriteFrame(WORD* RESTRICT fb, const GSVector4i& test, const GSVector4i* c, int pixels);
@@ -109,7 +109,7 @@ public:
// IDrawScanline
void BeginDraw(const GSRasterizerData* data, DrawScanlinePtr* dsf, DrawSolidRectPtr* dsrf);
void BeginDraw(const GSRasterizerData* data, Functions* f);
void EndDraw(const GSRasterizerStats& stats) {}
void SetupPrim(GS_PRIM_CLASS primclass, const GSVertexSW* vertices, const GSVertexSW& dscan);
void PrintStats() {}

View File

@@ -1357,20 +1357,7 @@ public:
{
for(int j = 0; j < 16; j++, dst += dstpitch)
{
#if _M_SSE >= 0x401
const GSVector4i* s = (const GSVector4i*)src;
s[j].gather32_8(pal, (GSVector4i*)dst);
#else
for(int i = 0; i < 16; i++)
{
((DWORD*)dst)[i] = pal[src[j * 16 + i]];
}
#endif
((const GSVector4i*)src)[j].gather32_8(pal, (GSVector4i*)dst);
}
}
@@ -1378,20 +1365,7 @@ public:
{
for(int j = 0; j < 16; j++, dst += dstpitch)
{
#if _M_SSE >= 0x401
const GSVector4i* s = (const GSVector4i*)src;
s[j].gather16_8(pal, (GSVector4i*)dst);
#else
for(int i = 0; i < 16; i++)
{
((WORD*)dst)[i] = (WORD)pal[src[j * 16 + i]];
}
#endif
((const GSVector4i*)src)[j].gather16_8(pal, (GSVector4i*)dst);
}
}
@@ -1399,20 +1373,7 @@ public:
{
for(int j = 0; j < 16; j++, dst += dstpitch)
{
#if _M_SSE >= 0x401
const GSVector4i* s = (const GSVector4i*)src;
s[j].gather64_8(pal, (GSVector4i*)dst);
#else
for(int i = 0; i < 32 / 2; i++)
{
((UINT64*)dst)[i] = pal[src[j * 16 + i]];
}
#endif
((const GSVector4i*)src)[j].gather64_8(pal, (GSVector4i*)dst);
}
}
@@ -1420,20 +1381,7 @@ public:
{
for(int j = 0; j < 16; j++, dst += dstpitch)
{
#if _M_SSE >= 0x401
const GSVector4i* s = (const GSVector4i*)src;
s[j].gather32_8(pal, (GSVector4i*)dst);
#else
for(int i = 0; i < 32 / 2; i++)
{
((DWORD*)dst)[i] = (DWORD)pal[src[j * 16 + i]];
}
#endif
((const GSVector4i*)src)[j].gather32_8(pal, (GSVector4i*)dst);
}
}
@@ -1441,21 +1389,10 @@ public:
{
for(int j = 0; j < 8; j++, dst += dstpitch)
{
#if _M_SSE >= 0x401
const GSVector4i* s = (const GSVector4i*)src;
((GSVector4i*)dst)[0] = (s[j * 2 + 0] >> 24).gather32_32<>(pal);
((GSVector4i*)dst)[1] = (s[j * 2 + 1] >> 24).gather32_32<>(pal);
#else
for(int i = 0; i < 8; i++)
{
((DWORD*)dst)[i] = pal[src[j * 8 + i] >> 24];
}
#endif
}
}
@@ -1487,21 +1424,10 @@ public:
{
for(int j = 0; j < 8; j++, dst += dstpitch)
{
#if _M_SSE >= 0x401
const GSVector4i* s = (const GSVector4i*)src;
((GSVector4i*)dst)[0] = ((s[j * 2 + 0] >> 24) & 0xf).gather32_32<>(pal);
((GSVector4i*)dst)[1] = ((s[j * 2 + 1] >> 24) & 0xf).gather32_32<>(pal);
#else
for(int i = 0; i < 8; i++)
{
((DWORD*)dst)[i] = pal[(src[j * 8 + i] >> 24) & 0xf];
}
#endif
}
}
@@ -1533,23 +1459,12 @@ public:
{
for(int j = 0; j < 8; j++, dst += dstpitch)
{
#if _M_SSE >= 0x401
const GSVector4i* s = (const GSVector4i*)src;
((GSVector4i*)dst)[0] = (s[j * 2 + 0] >> 28).gather32_32<>(pal);
((GSVector4i*)dst)[1] = (s[j * 2 + 1] >> 28).gather32_32<>(pal);
#else
for(int i = 0; i < 8; i++)
{
((DWORD*)dst)[i] = pal[src[j * 8 + i] >> 28];
}
#endif
}
}
}
__forceinline static void ExpandBlock4HH_16(DWORD* RESTRICT src, BYTE* RESTRICT dst, int dstpitch, const DWORD* RESTRICT pal)
{

View File

@@ -256,6 +256,7 @@ void GSClut::Read32(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA)
m_read.TEX0 = TEX0;
m_read.TEXA = TEXA;
m_read.dirty = false;
m_read.adirty = true;
WORD* clut = m_clut + (TEX0.CSA << 4);
@@ -296,6 +297,60 @@ void GSClut::Read32(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA)
}
}
void GSClut::GetAlphaMinMax32(int& amin, int& amax)
{
// call only after Read32
ASSERT(!m_read.dirty);
if(m_read.adirty)
{
m_read.adirty = false;
// DWORD bpp = GSLocalMemory::m_psm[m_read.TEX0.PSM].trbpp;
DWORD cbpp = GSLocalMemory::m_psm[m_read.TEX0.CPSM].trbpp;
DWORD pal = GSLocalMemory::m_psm[m_read.TEX0.PSM].pal;
if(cbpp == 24 && m_read.TEXA.AEM == 0)
{
m_read.amin = m_read.TEXA.TA0;
m_read.amax = m_read.TEXA.TA0;
}
else
{
int amin = 255;
int amax = 0;
const GSVector4i* p = (const GSVector4i*)m_buff32;
for(int i = 0, j = pal >> 4; i < j; i++)
{
GSVector4i v0 = (p[i * 4 + 0] >> 24).ps32(p[i * 4 + 1] >> 24);
GSVector4i v1 = (p[i * 4 + 2] >> 24).ps32(p[i * 4 + 3] >> 24);
GSVector4i v2 = v0.min_i16(v1);
GSVector4i v3 = v0.max_i16(v1);
v2 = v2.min_i16(v2.zwxy());
v3 = v3.max_i16(v3.zwxy());
v2 = v2.min_i16(v2.zwxyl());
v3 = v3.max_i16(v3.zwxyl());
v2 = v2.min_i16(v2.yxwzl());
v3 = v3.max_i16(v3.yxwzl());
amin = min(amin, v2.extract16<0>());
amax = max(amax, v3.extract16<0>());
}
m_read.amin = amin;
m_read.amax = amax;
}
}
amin = m_read.amin;
amax = m_read.amax;
}
//
void GSClut::WriteCLUT_T32_I8_CSM1(const DWORD* RESTRICT src, WORD* RESTRICT clut)

View File

@@ -50,6 +50,8 @@ __declspec(align(16)) class GSClut : public GSAlignedClass<16>
GIFRegTEX0 TEX0;
GIFRegTEXA TEXA;
bool dirty;
bool adirty;
int amin, amax;
bool IsDirty(const GIFRegTEX0& TEX0);
bool IsDirty(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA);
} m_read;
@@ -99,6 +101,7 @@ public:
void Write(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
void Read(const GIFRegTEX0& TEX0);
void Read32(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA);
void GetAlphaMinMax32(int& amin, int& amax);
DWORD operator [] (size_t i) const {return m_buff32[i];}

View File

@@ -700,10 +700,10 @@ void GSDevice10::StretchRect(Texture& st, const GSVector4& sr, Texture& dt, cons
GSVertexPT1 vertices[] =
{
{GSVector4(left, top), GSVector2(sr.x, sr.y)},
{GSVector4(right, top), GSVector2(sr.z, sr.y)},
{GSVector4(left, bottom), GSVector2(sr.x, sr.w)},
{GSVector4(right, bottom), GSVector2(sr.z, sr.w)},
{GSVector4(left, top, 0.5f, 1.0f), GSVector2(sr.x, sr.y)},
{GSVector4(right, top, 0.5f, 1.0f), GSVector2(sr.z, sr.y)},
{GSVector4(left, bottom, 0.5f, 1.0f), GSVector2(sr.x, sr.w)},
{GSVector4(right, bottom, 0.5f, 1.0f), GSVector2(sr.z, sr.w)},
};
D3D10_BOX box = {0, 0, 0, sizeof(vertices), 1, 1};

View File

@@ -882,10 +882,10 @@ void GSDevice9::StretchRect(Texture& st, const GSVector4& sr, Texture& dt, const
GSVertexPT1 vertices[] =
{
{GSVector4(left, top), GSVector2(sr.x, sr.y)},
{GSVector4(right, top), GSVector2(sr.z, sr.y)},
{GSVector4(left, bottom), GSVector2(sr.x, sr.w)},
{GSVector4(right, bottom), GSVector2(sr.z, sr.w)},
{GSVector4(left, top, 0.5f, 1.0f), GSVector2(sr.x, sr.y)},
{GSVector4(right, top, 0.5f, 1.0f), GSVector2(sr.z, sr.y)},
{GSVector4(left, bottom, 0.5f, 1.0f), GSVector2(sr.x, sr.w)},
{GSVector4(right, bottom, 0.5f, 1.0f), GSVector2(sr.z, sr.w)},
};
for(int i = 0; i < countof(vertices); i++)

File diff suppressed because it is too large Load Diff

View File

@@ -75,6 +75,8 @@ __declspec(align(16)) struct GSScanlineEnvironment
GSVector4i* zbr;
int** fbc;
int** zbc;
GSVector4i* fzbr;
GSVector4i* fzbc;
GSVector4i fm, zm;
struct {GSVector4i min, max, mask;} t; // [u] x 4 [v] x 4
@@ -85,10 +87,9 @@ __declspec(align(16)) struct GSScanlineEnvironment
GSVector4i afix, afix2;
GSVector4i frb, fga;
GSVector4 dz, dz4;
GSVector4i df, df4;
GSVector4 dt, dt4;
GSVector4i drb, dga, dc4;
struct {GSVector4 z, s, t, q; GSVector4i f, rb, ga, _pad;} d[4];
struct {GSVector4 z, stq; GSVector4i f, c;} d4;
GSVector4i rb, ga;
};
__declspec(align(16)) struct GSScanlineParam
@@ -102,56 +103,53 @@ __declspec(align(16)) struct GSScanlineParam
GSLocalMemory::Offset* fbo;
GSLocalMemory::Offset* zbo;
GSLocalMemory::Offset4* fzbo;
DWORD fm, zm;
};
class GSDrawScanline : public GSAlignedClass<16>, public IDrawScanline
{
struct ActiveDrawScanlinePtr
{
UINT64 frame;
UINT64 frames;
__int64 ticks;
__int64 pixels;
DrawScanlinePtr dsf;
};
GSScanlineEnvironment m_env;
DrawScanlinePtr m_ds[4][4][4][2];
CRBMap<DWORD, DrawScanlinePtr> m_dsmap;
CRBMap<DWORD, ActiveDrawScanlinePtr*> m_dsmap_active;
ActiveDrawScanlinePtr* m_dsf;
class GSFunctionMap : public IDrawScanline::FunctionMap
{
public:
DrawScanlinePtr f[4][4][4][2];
virtual DrawScanlinePtr GetDefaultFunction(DWORD dw)
{
GSScanlineSelector sel;
sel.dw = dw;
return f[sel.fpsm][sel.zpsm][sel.ztst][sel.iip];
}
};
GSFunctionMap m_ds;
static const GSVector4 s_ps0123[4];
static const GSVector4i s_test[9];
void Init();
template<DWORD fpsm, DWORD zpsm, DWORD ztst, DWORD iip>
void DrawScanline(int top, int left, int right, const GSVertexSW& v);
template<DWORD sel>
void DrawScanlineEx(int top, int left, int right, const GSVertexSW& v);
__forceinline GSVector4i Wrap(const GSVector4i& t);
__forceinline void SampleTexture(int pixels, DWORD ztst, DWORD fst, DWORD ltf, DWORD tlu, const GSVector4i& test, const GSVector4& s, const GSVector4& t, const GSVector4& q, GSVector4i* c);
__forceinline void SampleTexture(DWORD ztst, DWORD fst, DWORD ltf, DWORD tlu, const GSVector4i& test, const GSVector4& s, const GSVector4& t, const GSVector4& q, GSVector4i* c);
__forceinline void ColorTFX(DWORD tfx, const GSVector4i& rbf, const GSVector4i& gaf, GSVector4i& rbt, GSVector4i& gat);
__forceinline void AlphaTFX(DWORD tfx, DWORD tcc, const GSVector4i& gaf, GSVector4i& gat);
__forceinline void Fog(DWORD fge, const GSVector4i& f, GSVector4i& rb, GSVector4i& ga);
__forceinline bool TestZ(DWORD zpsm, DWORD ztst, const GSVector4i& zs, const GSVector4i& za, GSVector4i& test);
__forceinline bool TestZ(DWORD zpsm, DWORD ztst, const GSVector4i& zs, const GSVector4i& zd, GSVector4i& test);
__forceinline bool TestAlpha(DWORD atst, DWORD afail, const GSVector4i& ga, GSVector4i& fm, GSVector4i& zm, GSVector4i& test);
__forceinline bool TestDestAlpha(DWORD fpsm, DWORD date, const GSVector4i& d, GSVector4i& test);
__forceinline static DWORD ReadPixel32(DWORD* RESTRICT vm, DWORD addr);
__forceinline static DWORD ReadPixel24(DWORD* RESTRICT vm, DWORD addr);
__forceinline static DWORD ReadPixel16(WORD* RESTRICT vm, DWORD addr);
__forceinline static void WritePixel32(DWORD* RESTRICT vm, DWORD addr, DWORD c);
__forceinline static void WritePixel24(DWORD* RESTRICT vm, DWORD addr, DWORD c);
__forceinline static void WritePixel16(WORD* RESTRICT vm, DWORD addr, DWORD c);
__forceinline GSVector4i ReadFrameX(int psm, const GSVector4i& addr) const;
__forceinline GSVector4i ReadZBufX(int psm, const GSVector4i& addr) const;
__forceinline void WriteFrameAndZBufX(int fpsm, const GSVector4i& fa, const GSVector4i& fm, const GSVector4i& f, int zpsm, const GSVector4i& za, const GSVector4i& zm, const GSVector4i& z, int pixels);
__forceinline void WriteFrameX(int fpsm, int rfb, GSVector4i* c, const GSVector4i& fd, const GSVector4i& fm, const GSVector4i& fza, int fzm);
__forceinline void WriteZBufX(int zpsm, int ztst, const GSVector4i& z, const GSVector4i& zd, const GSVector4i& zm, const GSVector4i& fza, int fzm);
void DrawSolidRect(const GSVector4i& r, const GSVertexSW& v);
@@ -164,6 +162,12 @@ class GSDrawScanline : public GSAlignedClass<16>, public IDrawScanline
template<class T, bool masked>
__forceinline void FillBlock(const GSVector4i* row, int* col, const GSVector4i& r, const GSVector4i& c, const GSVector4i& m);
template<DWORD fpsm, DWORD zpsm, DWORD ztst, DWORD iip>
void DrawScanline(int top, int left, int right, const GSVertexSW& v);
template<DWORD sel>
void DrawScanlineEx(int top, int left, int right, const GSVertexSW& v);
protected:
GSState* m_state;
int m_id;
@@ -174,8 +178,8 @@ public:
// IDrawScanline
void BeginDraw(const GSRasterizerData* data, DrawScanlinePtr* dsf, DrawSolidRectPtr* dsrf);
void BeginDraw(const GSRasterizerData* data, Functions* f);
void EndDraw(const GSRasterizerStats& stats);
void SetupPrim(GS_PRIM_CLASS primclass, const GSVertexSW* vertices, const GSVertexSW& dscan);
void PrintStats();
void PrintStats() {m_ds.PrintStats();}
};

View File

@@ -402,6 +402,15 @@ GSLocalMemory::~GSLocalMemory()
}
m_omap.RemoveAll();
pos = m_o4map.GetHeadPosition();
while(pos)
{
_aligned_free(m_o4map.GetNextValue(pos));
}
m_o4map.RemoveAll();
}
GSLocalMemory::Offset* GSLocalMemory::GetOffset(DWORD bp, DWORD bw, DWORD psm, Offset* o)
@@ -428,7 +437,7 @@ GSLocalMemory::Offset* GSLocalMemory::GetOffset(DWORD bp, DWORD bw, DWORD psm, O
pixelAddress pa = m_psm[psm].pa;
for(int i = 0, j = 2048; i < j; i++)
for(int i = 0; i < 2048; i++)
{
o->row[i] = GSVector4i((int)pa(0, i, bp, bw));
}
@@ -449,6 +458,57 @@ GSLocalMemory::Offset* GSLocalMemory::GetOffset(DWORD bp, DWORD bw, DWORD psm, O
return o;
}
GSLocalMemory::Offset4* GSLocalMemory::GetOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF, Offset4* o)
{
	// Returns the combined frame/z address table for the given FRAME/ZBUF pair.
	// 'o' is an optional previously returned table; it is handed back unchanged
	// when its hash still matches, which avoids the map lookup.

	DWORD fbp = FRAME.Block();
	DWORD zbp = ZBUF.Block();
	DWORD fpsm = FRAME.PSM;
	DWORD zpsm = ZBUF.PSM;
	DWORD bw = FRAME.FBW;

	ASSERT(m_psm[fpsm].bpp > 8 || m_psm[zpsm].bpp > 8); // only for 16/24/32 formats

	DWORD hash = (FRAME.FBP << 0) | (ZBUF.ZBP << 9) | (bw << 18) | (((fpsm & 3) | (fpsm >> 3)) << 24) | (((zpsm & 3) | (zpsm >> 3)) << 28);

	if(o && o->hash == hash)
	{
		return o;
	}

	// Not the caller's table: try the cache next.

	CRBMap<DWORD, Offset4*>::CPair* cached = m_o4map.Lookup(hash);

	if(cached)
	{
		return cached->m_value;
	}

	// First use of this configuration: build the table and remember it
	// (freed with _aligned_free in the destructor).

	Offset4* table = (Offset4*)_aligned_malloc(sizeof(Offset4), 16);

	table->hash = hash;

	pixelAddress fpa = m_psm[fpsm].pa;
	pixelAddress zpa = m_psm[zpsm].pa;

	for(int y = 0; y < 2048; y++)
	{
		// frame address of (0, y) in the two low lanes, z address in the two high lanes

		table->row[y] = GSVector4i((int)fpa(0, y, fbp, bw), (int)zpa(0, y, zbp, bw)).xxyy();
	}

	for(int n = 0; n < 512; n++)
	{
		// column offsets for x = n * 4 and x = n * 4 + 2, frame first, then z

		int f0 = m_psm[fpsm].rowOffset[0][n * 4 + 0];
		int f2 = m_psm[fpsm].rowOffset[0][n * 4 + 2];
		int z0 = m_psm[zpsm].rowOffset[0][n * 4 + 0];
		int z2 = m_psm[zpsm].rowOffset[0][n * 4 + 2];

		table->col[n] = GSVector4i(f0, f2, z0, z2);
	}

	m_o4map.SetAt(hash, table);

	return table;
}
bool GSLocalMemory::FillRect(const GSVector4i& r, DWORD c, DWORD psm, DWORD bp, DWORD bw)
{
const psm_t& tbl = m_psm[psm];

View File

@@ -77,8 +77,15 @@ public:
struct Offset
{
GSVector4i row[2048];
int* col[4];
GSVector4i row[2048]; // 0 | 0 | 0 | 0
int* col[4]; // x | x+1 | x+2 | x+3
DWORD hash;
};
// Combined frame buffer / z buffer address table, built and cached by GetOffset4.
// Each vector carries frame offsets in its first two lanes and z offsets in its last two.
struct Offset4
{
// per scanline y: address of pixel (0, y) in the frame buffer (twice) and in the z buffer (twice)
GSVector4i row[2048]; // f 0 | f 0 | z 0 | z 0
// per group of four pixels: entry i holds the column offsets for x = i * 4 and x = i * 4 + 2
GSVector4i col[512]; // f x | f x+2 | z x | z x+2 ((x & 3) == 0)
// cache key computed by GetOffset4 from FBP, ZBP, FBW and both pixel formats
DWORD hash;
};
@@ -118,12 +125,14 @@ protected:
//
CRBMapC<DWORD, Offset*> m_omap;
CRBMapC<DWORD, Offset4*> m_o4map;
public:
GSLocalMemory();
virtual ~GSLocalMemory();
Offset* GetOffset(DWORD bp, DWORD bw, DWORD psm, Offset* o = NULL);
Offset4* GetOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF, Offset4* o = NULL);
// address

View File

@@ -36,10 +36,10 @@ GSRasterizer::~GSRasterizer()
void GSRasterizer::Draw(const GSRasterizerData* data)
{
m_dsf = NULL;
m_dsrf = NULL;
m_dsf.sl = NULL;
m_dsf.sr = NULL;
m_ds->BeginDraw(data, &m_dsf, &m_dsrf);
m_ds->BeginDraw(data, &m_dsf);
const GSVector4i scissor = data->scissor;
const GSVertexSW* vertices = data->vertices;
@@ -94,7 +94,9 @@ void GSRasterizer::DrawPoint(const GSVertexSW* v, const GSVector4i& scissor)
{
if((p.y % m_threads) == m_id)
{
(m_ds->*m_dsf)(p.y, p.x, p.x + 1, *v);
m_ds->SetupPrim(GS_POINT_CLASS, v, *v);
(m_ds->*m_dsf.sl)(p.y, p.x, p.x + 1, *v);
m_stats.pixels++;
}
@@ -208,19 +210,32 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices, const GSVector4i& sc
void GSRasterizer::DrawTriangleTop(GSVertexSW* v, const GSVector4i& scissor)
{
GSVertexSW longest = v[2] - v[1];
if((longest.p == GSVector4::zero()).mask() & 1)
{
return;
}
GSVertexSW longest;
GSVertexSW dscan = longest * longest.p.xxxx().rcp();
longest.p = v[2].p - v[1].p;
int i = (longest.p > GSVector4::zero()).mask() & 1;
int i = (longest.p > GSVector4::zero()).upl(longest.p == GSVector4::zero()).mask();
if(i & 2) return;
i &= 1;
GSVertexSW& l = v[0];
GSVector4 r = v[0].p;
GSVector4& r = v[0].p;
GSVector4i tb(l.p.xyxy(v[2].p).ceil());
int top = tb.extract32<1>();
int bottom = tb.extract32<3>();
if(top < scissor.y) top = scissor.y;
if(bottom > scissor.w) bottom = scissor.w;
if(top >= bottom) return;
longest.t = v[2].t - v[1].t;
longest.c = v[2].c - v[1].c;
GSVertexSW dscan = longest * longest.p.xxxx().rcp();
GSVertexSW vl = v[2 - i] - l;
GSVector4 vr = v[1 + i].p - r;
@@ -228,93 +243,79 @@ void GSRasterizer::DrawTriangleTop(GSVertexSW* v, const GSVector4i& scissor)
GSVertexSW dl = vl / vl.p.yyyy();
GSVector4 dr = vr / vr.yyyy();
GSVector4i tb(l.p.xyxy(v[2].p).ceil());
float py = (float)top - l.p.y;
int top = tb.y;
int bottom = tb.w;
l.p = l.p.upl(r).xyzw(l.p); // r.x => l.y
dl.p = dl.p.upl(dr).xyzw(dl.p); // dr.x => dl.y
if(top < scissor.y) top = scissor.y;
if(bottom > scissor.w) bottom = scissor.w;
if(py > 0) l += dl * py;
if(top < bottom)
{
float py = (float)top - l.p.y;
m_ds->SetupPrim(GS_TRIANGLE_CLASS, v, dscan);
if(py > 0)
{
GSVector4 dy(py);
l += dl * dy;
r += dr * dy;
}
m_ds->SetupPrim(GS_TRIANGLE_CLASS, v, dscan);
DrawTriangleSection(top, bottom, l, dl, r, dr, dscan, scissor);
}
DrawTriangleSection(top, bottom, l, dl, dscan, scissor);
}
void GSRasterizer::DrawTriangleBottom(GSVertexSW* v, const GSVector4i& scissor)
{
GSVertexSW longest = v[1] - v[0];
GSVertexSW longest;
if((longest.p == GSVector4::zero()).mask() & 1)
{
return;
}
GSVertexSW dscan = longest * longest.p.xxxx().rcp();
longest.p = v[1].p - v[0].p;
int i = (longest.p > GSVector4::zero()).mask() & 1;
int i = (longest.p > GSVector4::zero()).upl(longest.p == GSVector4::zero()).mask();
if(i & 2) return;
i &= 1;
GSVertexSW& l = v[1 - i];
GSVector4& r = v[i].p;
GSVector4i tb(l.p.xyxy(v[2].p).ceil());
int top = tb.extract32<1>();
int bottom = tb.extract32<3>();
if(top < scissor.y) top = scissor.y;
if(bottom > scissor.w) bottom = scissor.w;
if(top >= bottom) return;
longest.t = v[1].t - v[0].t;
longest.c = v[1].c - v[0].c;
GSVertexSW dscan = longest * longest.p.xxxx().rcp();
GSVertexSW vl = v[2] - l;
GSVector4 vr = v[2].p - r;
GSVertexSW dl = vl / vl.p.yyyy();
GSVector4 dr = vr / vr.yyyy();
GSVector4i tb(l.p.xyxy(v[2].p).ceil());
float py = (float)top - l.p.y;
int top = tb.y;
int bottom = tb.w;
l.p = l.p.upl(r).xyzw(l.p); // r.x => l.y
dl.p = dl.p.upl(dr).xyzw(dl.p); // dr.x => dl.y
if(top < scissor.y) top = scissor.y;
if(bottom > scissor.w) bottom = scissor.w;
if(top < bottom)
{
float py = (float)top - l.p.y;
if(py > 0) l += dl * py;
if(py > 0)
{
GSVector4 dy(py);
m_ds->SetupPrim(GS_TRIANGLE_CLASS, v, dscan);
l += dl * dy;
r += dr * dy;
}
m_ds->SetupPrim(GS_TRIANGLE_CLASS, v, dscan);
DrawTriangleSection(top, bottom, l, dl, r, dr, dscan, scissor);
}
DrawTriangleSection(top, bottom, l, dl, dscan, scissor);
}
void GSRasterizer::DrawTriangleTopBottom(GSVertexSW* v, const GSVector4i& scissor)
{
GSVertexSW v01, v02, v12;
GSVertexSW dv[3];
v01 = v[1] - v[0];
v02 = v[2] - v[0];
dv[0] = v[1] - v[0];
dv[1] = v[2] - v[0];
GSVertexSW longest = v[0] + v02 * (v01.p / v02.p).yyyy() - v[1];
GSVertexSW longest = v[0] + dv[1] * (dv[0].p / dv[1].p).yyyy() - v[1];
if((longest.p == GSVector4::zero()).mask() & 1)
{
return;
}
int i = (longest.p > GSVector4::zero()).upl(longest.p == GSVector4::zero()).mask();
if(i & 2) return;
i &= 1;
GSVertexSW dscan = longest * longest.p.xxxx().rcp();
@@ -326,18 +327,8 @@ void GSRasterizer::DrawTriangleTopBottom(GSVertexSW* v, const GSVector4i& scisso
GSVertexSW dl;
GSVector4 dr;
bool b = (longest.p > GSVector4::zero()).mask() & 1;
if(b)
{
dl = v01 / v01.p.yyyy();
dr = v02.p / v02.p.yyyy();
}
else
{
dl = v02 / v02.p.yyyy();
dr = v01.p / v01.p.yyyy();
}
dl = dv[1 - i] / dv[1 - i].p.yyyy();
dr = dv[i].p / dv[i].p.yyyy();
GSVector4i tb(v[0].p.yyyy(v[1].p).xzyy(v[2].p).ceil());
@@ -362,21 +353,21 @@ void GSRasterizer::DrawTriangleTopBottom(GSVertexSW* v, const GSVector4i& scisso
DrawTriangleSection(top, bottom, l, dl, r, dr, dscan, scissor);
}
if(b)
if(i)
{
v12 = v[2] - v[1];
l = v[1];
dl = v12 / v12.p.yyyy();
dv[2] = v[2] - v[1];
dl = dv[2] / dv[2].p.yyyy();
}
else
{
v12.p = v[2].p - v[1].p;
r = v[1].p;
dr = v12.p / v12.p.yyyy();
dv[2].p = v[2].p - v[1].p;
dr = dv[2].p / dv[2].p.yyyy();
}
top = tb.y;
@@ -395,7 +386,10 @@ void GSRasterizer::DrawTriangleTopBottom(GSVertexSW* v, const GSVector4i& scisso
if(py > 0) r += dr * py;
DrawTriangleSection(top, bottom, l, dl, r, dr, dscan, scissor);
l.p = l.p.upl(r).xyzw(l.p); // r.x => l.y
dl.p = dl.p.upl(dr).xyzw(dl.p); // dr.x => dl.y
DrawTriangleSection(top, bottom, l, dl, dscan, scissor);
}
}
@@ -407,28 +401,12 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const
{
do
{
// rarely used (character shadows in ffx-2)
/*
int scanmsk = (int)m_state->m_env.SCANMSK.MSK - 2;
if(scanmsk >= 0)
{
if(((top & 1) ^ scanmsk) == 0)
{
continue;
}
}
*/
if((top % m_threads) == m_id)
{
GSVector4i lr(l.p.xyxy(r).ceil());
int& left = lr.x;
int& right = lr.z;
int left = lr.extract32<0>();
int right = lr.extract32<2>();
if(left < scissor.x) left = scissor.x;
if(right > scissor.z) right = scissor.z;
@@ -439,13 +417,20 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const
{
m_stats.pixels += pixels;
GSVertexSW scan = l;
GSVertexSW scan;
float px = (float)left - l.p.x;
if(px > 0) scan += dscan * px;
if(px > 0)
{
scan = l + dscan * px;
}
else
{
scan = l;
}
(m_ds->*m_dsf)(top, left, right, scan);
(m_ds->*m_dsf.sl)(top, left, right, scan);
}
}
}
@@ -458,6 +443,56 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const
}
}
void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const GSVertexSW& dl, const GSVertexSW& dscan, const GSVector4i& scissor)
{
ASSERT(top < bottom);
while(1)
{
do
{
if((top % m_threads) == m_id)
{
GSVector4i lr(l.p.ceil());
int left = lr.extract32<0>();
int right = lr.extract32<1>();
if(left < scissor.x) left = scissor.x;
if(right > scissor.z) right = scissor.z;
int pixels = right - left;
if(pixels > 0)
{
m_stats.pixels += pixels;
GSVertexSW scan;
float px = (float)left - l.p.x;
if(px > 0)
{
scan = l + dscan * px;
}
else
{
scan = l;
}
(m_ds->*m_dsf.sl)(top, left, right, scan);
}
}
}
while(0);
if(++top >= bottom) break;
l += dl;
}
}
void GSRasterizer::DrawSprite(const GSVertexSW* vertices, const GSVector4i& scissor)
{
GSVertexSW v[2];
@@ -499,11 +534,11 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices, const GSVector4i& scis
GSVertexSW scan = v[0];
if(m_dsrf)
if(m_dsf.sr)
{
if(m_id == 0)
{
(m_ds->*m_dsrf)(r, scan);
(m_ds->*m_dsf.sr)(r, scan);
m_stats.pixels += (r.z - r.x) * (r.w - r.y);
}
@@ -518,6 +553,9 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices, const GSVector4i& scis
dedge.p = zero;
dscan.p = zero;
dedge.c = zero;
dscan.c = zero;
GSVertexSW dv = v[1] - v[0];
dedge.t = (dv.t / dv.p.yyyy()).xyxy(zero).wyww();
@@ -532,7 +570,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices, const GSVector4i& scis
{
if((top % m_threads) == m_id)
{
(m_ds->*m_dsf)(top, left, right, scan);
(m_ds->*m_dsf.sl)(top, left, right, scan);
m_stats.pixels += right - left;
}
@@ -617,6 +655,8 @@ GSRasterizerList::GSRasterizerList()
// get a whole cache line (twice the size for future cpus ;)
m_sync = (long*)_aligned_malloc(sizeof(*m_sync), 128);
*m_sync = 0;
}
GSRasterizerList::~GSRasterizerList()
@@ -681,3 +721,136 @@ void GSRasterizerList::PrintStats()
GetHead()->PrintStats();
}
}
//
// Starts with no active entry; m_active is first set by Lookup.
IDrawScanline::FunctionMap::FunctionMap()
: m_active(NULL)
{
}
IDrawScanline::FunctionMap::~FunctionMap()
{
	// The map stores raw pointers allocated in Lookup; free each record
	// before discarding the map nodes themselves.

	for(POSITION pos = m_map_active.GetHeadPosition(); pos != NULL; )
	{
		delete m_map_active.GetNextValue(pos);
	}

	m_map_active.RemoveAll();
}
// Registers f as the scanline function for selector sel.
// Lookup prefers entries registered here over GetDefaultFunction.
void IDrawScanline::FunctionMap::SetAt(DWORD sel, DrawScanlinePtr f)
{
m_map.SetAt(sel, f);
}
IDrawScanline::DrawScanlinePtr IDrawScanline::FunctionMap::Lookup(DWORD sel)
{
	// Resolves the scanline function for a selector and makes its statistics
	// record the active one, so that the next UpdateStats is charged to it.

	m_active = NULL;

	ActiveDrawScanlinePtr* entry = NULL;

	if(!m_map_active.Lookup(sel, entry))
	{
		// First use of this selector: create a zeroed record. The frame marker
		// starts at (UINT64)-1 so that the first UpdateStats counts a new frame.

		entry = new ActiveDrawScanlinePtr();

		memset(entry, 0, sizeof(*entry));

		entry->frame = (UINT64)-1;

		// Prefer a function registered through SetAt, otherwise ask the derived class.

		CRBMap<DWORD, DrawScanlinePtr>::CPair* registered = m_map.Lookup(sel);

		if(registered)
		{
			entry->f = registered->m_value;
		}
		else
		{
			entry->f = GetDefaultFunction(sel);
		}

		m_map_active.SetAt(sel, entry);
	}

	m_active = entry;

	return entry->f;
}
void IDrawScanline::FunctionMap::UpdateStats(const GSRasterizerStats& stats, UINT64 frame)
{
if(m_active)
{
if(m_active->frame != frame)
{
m_active->frame = frame;
m_active->frames++;
}
m_active->pixels += stats.pixels;
m_active->ticks += stats.ticks;
}
}
void IDrawScanline::FunctionMap::PrintStats()
{
	// Part 1: write an "InitDS_Sel(...)" line to c:\1.txt for every selector that has
	// no function registered in m_map, has been active for more than 30 frames and
	// costs at least 5% of a frame (percentages assume 3 GHz and 60 fps).

	if(FILE* fp = fopen("c:\\1.txt", "w"))
	{
		POSITION pos = m_map_active.GetHeadPosition();

		while(pos)
		{
			DWORD sel;
			ActiveDrawScanlinePtr* p;

			m_map_active.GetNextAssoc(pos, sel, p);

			if(m_map.Lookup(sel))
			{
				continue;
			}

			if(p->frames > 30)
			{
				int tpf = (int)((p->ticks / p->frames) * 10000 / (3000000000 / 60)); // 3 GHz, 60 fps

				if(tpf >= 500)
				{
					_ftprintf(fp, _T("InitDS_Sel(0x%08x); // %6.2f%%\n"), sel, (float)tpf / 100);
				}
			}
		}

		fclose(fp);
	}

	// Part 2: print a per-selector table to stdout.

	{
		// Total ticks per frame over all entries, used for the relative column.

		__int64 ttpf = 0;

		POSITION pos = m_map_active.GetHeadPosition();

		while(pos)
		{
			ActiveDrawScanlinePtr* p = m_map_active.GetNextValue(pos);

			// An entry created by Lookup that UpdateStats has not counted yet still has
			// frames == 0; skip it here, the same way the printing loop below does,
			// instead of dividing by zero.

			if(p->frames > 0)
			{
				ttpf += p->ticks / p->frames;
			}
		}

		pos = m_map_active.GetHeadPosition();

		while(pos)
		{
			DWORD sel;
			ActiveDrawScanlinePtr* p;

			m_map_active.GetNextAssoc(pos, sel, p);

			if(p->frames > 0)
			{
				__int64 tpp = p->pixels > 0 ? p->ticks / p->pixels : 0;
				__int64 tpf = p->ticks / p->frames;
				__int64 ppf = p->pixels / p->frames;

				// '*' marks selectors that have no function registered in m_map.
				// The first percentage is relative to 50000000 ticks (3 GHz / 60 fps),
				// the second to the total over all entries (0 when that total is 0).

				printf("[%08x]%c %6.2f%% | %5.2f%% | f %4I64d | p %10I64d | tpp %4I64d | tpf %9I64d | ppf %7I64d\n",
					sel, !m_map.Lookup(sel) ? '*' : ' ',
					(float)(tpf * 10000 / 50000000) / 100,
					ttpf > 0 ? (float)(tpf * 10000 / ttpf) / 100 : 0.0f,
					p->frames, p->pixels,
					tpp, tpf, ppf);
			}
		}
	}
}

View File

@@ -63,9 +63,40 @@ public:
typedef void (IDrawScanline::*DrawScanlinePtr)(int top, int left, int right, const GSVertexSW& v);
typedef void (IDrawScanline::*DrawSolidRectPtr)(const GSVector4i& r, const GSVertexSW& v);
// Pair of drawing entry points that BeginDraw fills in for the rasterizer.
struct Functions
{
// scanline routine, invoked as (top, left, right, v)
DrawScanlinePtr sl;
// solid rectangle routine; may be left NULL, callers test it before use
DrawSolidRectPtr sr;
};
// Maps a 32-bit selector to a scanline function and keeps per-selector usage statistics.
// Derived classes supply the fallback function for selectors that were never registered.
class FunctionMap
{
// Statistics record for one selector that has been looked up at least once.
struct ActiveDrawScanlinePtr
{
// frame: last frame number passed to UpdateStats; frames: number of distinct frames counted
UINT64 frame, frames;
// accumulated ticks and pixels from the stats passed to UpdateStats
__int64 ticks, pixels;
// function resolved for this selector
DrawScanlinePtr f;
};
// selector -> function registered through SetAt
CRBMap<DWORD, DrawScanlinePtr> m_map;
// selector -> heap-allocated statistics record, created by Lookup and deleted in the destructor
CRBMap<DWORD, ActiveDrawScanlinePtr*> m_map_active;
// record selected by the most recent Lookup; UpdateStats accumulates into it
ActiveDrawScanlinePtr* m_active;
protected:
// Called by Lookup when sel has no entry in m_map.
virtual DrawScanlinePtr GetDefaultFunction(DWORD sel) = 0;
public:
FunctionMap();
virtual ~FunctionMap();
// Registers f for selector sel.
void SetAt(DWORD sel, DrawScanlinePtr f);
// Returns the function for sel and makes its record the active one.
DrawScanlinePtr Lookup(DWORD sel);
// Adds stats to the active record; frame is used to count distinct frames.
void UpdateStats(const GSRasterizerStats& stats, UINT64 frame);
// Writes a report to c:\1.txt and prints a per-selector table to stdout.
void PrintStats();
};
virtual ~IDrawScanline() {}
virtual void BeginDraw(const GSRasterizerData* data, DrawScanlinePtr* dsf, DrawSolidRectPtr* dsrf) = 0;
virtual void BeginDraw(const GSRasterizerData* data, Functions* dsf) = 0;
virtual void EndDraw(const GSRasterizerStats& stats) = 0;
virtual void SetupPrim(GS_PRIM_CLASS primclass, const GSVertexSW* vertices, const GSVertexSW& dscan) = 0;
virtual void PrintStats() = 0;
@@ -75,8 +106,7 @@ class GSRasterizer : public IRasterizer
{
protected:
IDrawScanline* m_ds;
IDrawScanline::DrawScanlinePtr m_dsf;
IDrawScanline::DrawSolidRectPtr m_dsrf;
IDrawScanline::Functions m_dsf;
int m_id;
int m_threads;
GSRasterizerStats m_stats;
@@ -91,6 +121,8 @@ protected:
void DrawTriangleTopBottom(GSVertexSW* v, const GSVector4i& scissor);
__forceinline void DrawTriangleSection(int top, int bottom, GSVertexSW& l, const GSVertexSW& dl, GSVector4& r, const GSVector4& dr, const GSVertexSW& dscan, const GSVector4i& scissor);
__forceinline void DrawTriangleSection(int top, int bottom, GSVertexSW& l, const GSVertexSW& dl, const GSVertexSW& dscan, const GSVector4i& scissor);
__forceinline void DrawScanline(int top, int left, int right, const GSVertexSW& scan, const GSVertexSW& dscan);
public:
GSRasterizer(IDrawScanline* ds, int id = 0, int threads = 0);

View File

@@ -590,10 +590,10 @@ void GSRendererHW10::SetupDATE(Texture& rt, Texture& ds)
GSVertexPT1 vertices[] =
{
{GSVector4(mm.x, -mm.y), GSVector2(uv.x, uv.y)},
{GSVector4(mm.z, -mm.y), GSVector2(uv.z, uv.y)},
{GSVector4(mm.x, -mm.w), GSVector2(uv.x, uv.w)},
{GSVector4(mm.z, -mm.w), GSVector2(uv.z, uv.w)},
{GSVector4(mm.x, -mm.y, 0.5f, 1.0f), GSVector2(uv.x, uv.y)},
{GSVector4(mm.z, -mm.y, 0.5f, 1.0f), GSVector2(uv.z, uv.y)},
{GSVector4(mm.x, -mm.w, 0.5f, 1.0f), GSVector2(uv.x, uv.w)},
{GSVector4(mm.z, -mm.w, 0.5f, 1.0f), GSVector2(uv.z, uv.w)},
};
D3D10_BOX box = {0, 0, 0, sizeof(vertices), 1, 1};

View File

@@ -547,10 +547,10 @@ void GSRendererHW9::SetupDATE(Texture& rt, Texture& ds)
GSVertexPT1 vertices[] =
{
{GSVector4(mm.x, -mm.y), GSVector2(uv.x, uv.y)},
{GSVector4(mm.z, -mm.y), GSVector2(uv.z, uv.y)},
{GSVector4(mm.x, -mm.w), GSVector2(uv.x, uv.w)},
{GSVector4(mm.z, -mm.w), GSVector2(uv.z, uv.w)},
{GSVector4(mm.x, -mm.y, 0.5f, 1.0f), GSVector2(uv.x, uv.y)},
{GSVector4(mm.z, -mm.y, 0.5f, 1.0f), GSVector2(uv.z, uv.y)},
{GSVector4(mm.x, -mm.w, 0.5f, 1.0f), GSVector2(uv.x, uv.w)},
{GSVector4(mm.z, -mm.w, 0.5f, 1.0f), GSVector2(uv.z, uv.w)},
};
m_dev.IASetVertexBuffer(4, vertices);

View File

@@ -37,10 +37,12 @@ protected:
bool m_reset;
GSLocalMemory::Offset* m_fbo;
GSLocalMemory::Offset* m_zbo;
GSLocalMemory::Offset4* m_fzbo;
__declspec(align(16)) struct VertexTrace
{
GSVertexSW v;
GSVector4 tmin, tmax;
GSVector4 cmin, cmax;
union
{
@@ -49,12 +51,12 @@ protected:
struct {DWORD stq:4, rgba:4;};
} eq;
bool first;
// Puts the trace back into its initial state, before any vertex has been folded in.
void Reset()
{
first = true;
eq.value = 0xffffffff;
// The texture range starts inverted (min = +FLT_MAX, max = -FLT_MAX) so that the first
// minv/maxv applied to it yields the incoming vertex value.
tmin = GSVector4(FLT_MAX);
tmax = GSVector4(-FLT_MAX);
// NOTE(review): cmax starts at zero instead of -FLT_MAX; presumably fine because color
// components are never negative - confirm.
cmin = GSVector4(FLT_MAX);
cmax = GSVector4::zero();
}
// A freshly constructed trace starts in the same state that Reset() establishes.
VertexTrace() {Reset();}
@@ -86,8 +88,7 @@ protected:
m_reset = false;
}
//
if((m_perfmon.GetFrame() & 255) == 0) m_rl.PrintStats();
// if((m_perfmon.GetFrame() & 255) == 0) m_rl.PrintStats();
}
void ResetDevice()
@@ -154,30 +155,29 @@ protected:
v.c = GSVector4((DWORD)m_v.RGBAQ.ai32[0]) * 128.0f;
m_vtrace.cmin = m_vtrace.cmin.minv(v.c);
m_vtrace.cmax = m_vtrace.cmax.maxv(v.c);
if(PRIM->TME)
{
float q;
if(PRIM->FST)
{
v.t = GSVector4(GSVector4i((int)m_v.UV.U, (int)m_v.UV.V) << (16 - 4));
v.t.z = 1.0f;
v.t = GSVector4(GSVector4i((int)m_v.UV.U, (int)m_v.UV.V, 0, 0) << (16 - 4));
q = 1.0f;
}
else
{
v.t = GSVector4(m_v.ST.S, m_v.ST.T, 0.0f, 0.0f);
v.t *= GSVector4((float)(0x10000 << m_context->TEX0.TW), (float)(0x10000 << m_context->TEX0.TH));
v.t.z = m_v.RGBAQ.Q;
v.t *= GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH);
q = m_v.RGBAQ.Q;
}
}
if(m_vtrace.first)
{
m_vtrace.v.t = v.t;
m_vtrace.v.c = v.c;
m_vtrace.first = false;
}
else
{
m_vtrace.eq.value &= (m_vtrace.v.t == v.t).mask() | ((m_vtrace.v.c == v.c).mask() << 4); // v.p not needed
v.t = v.t.xyxy(GSVector4::load(q));
m_vtrace.tmin = m_vtrace.tmin.minv(v.t);
m_vtrace.tmax = m_vtrace.tmax.maxv(v.t);
}
m_vl.AddTail() = v;
@@ -265,6 +265,136 @@ protected:
return v;
}
// Tries to decide the outcome of the alpha test for the whole batch up front.
//
// An alpha range [amin, amax] is derived from the traced vertex alpha (m_vtrace.cmin/cmax)
// and, when texture alpha takes part in the result, from TEXA or the CLUT. If the ATST
// comparison against AREF has the same outcome for every value in that range, the result
// is known without looking at individual pixels.
//
// fm, zm: frame and z masks, modified in place according to TEST.AFAIL only when the test
//         is known to fail for the whole range; otherwise they are left untouched.
//
// Returns true when the outcome was decided here (always pass, or always fail with the
// masks already adjusted). Returns false, without touching fm/zm, when the outcome depends
// on per-pixel alpha.
bool TryAlphaTest(DWORD& fm, DWORD& zm)
{
	const GSDrawingEnvironment& env = m_env;
	const GSDrawingContext* context = m_context;

	bool pass = true;

	if(context->TEST.ATST == ATST_NEVER)
	{
		pass = false;
	}
	else if(context->TEST.ATST != ATST_ALWAYS)
	{
		// a.x is used as the minimum and a.z as the maximum vertex alpha below.
		// NOTE(review): the >> 7 presumably undoes the * 128 scaling applied to vertex colors - confirm.
		GSVector4i a = GSVector4i(m_vtrace.cmin.wwww(m_vtrace.cmax)) >> 7;

		int amin, amax;

		if(PRIM->TME && (context->TEX0.TCC || context->TEX0.TFX == TFX_DECAL))
		{
			// Texture alpha contributes, so start from the alpha range of the texture format.

			DWORD bpp = GSLocalMemory::m_psm[context->TEX0.PSM].trbpp;

			if(bpp == 32)
			{
				// No range is derived for 32 bpp textures here; leave the decision to the caller.

				return false;
			}
			else if(bpp == 24)
			{
				amin = env.TEXA.AEM ? 0 : env.TEXA.TA0;
				amax = env.TEXA.TA0;
			}
			else if(bpp == 16)
			{
				amin = env.TEXA.AEM ? 0 : min(env.TEXA.TA0, env.TEXA.TA1);
				amax = max(env.TEXA.TA0, env.TEXA.TA1);
			}
			else
			{
				// Remaining formats take their alpha range from the current CLUT contents.

				m_mem.m_clut.GetAlphaMinMax32(amin, amax);
			}

			switch(context->TEX0.TFX)
			{
			case TFX_MODULATE:
				amin = (amin * a.x) >> 7;
				amax = (amax * a.z) >> 7;
				if(amin > 255) amin = 255;
				if(amax > 255) amax = 255;
				break;
			case TFX_DECAL:
				// texture range is used unchanged
				break;
			case TFX_HIGHLIGHT:
				amin = amin + a.x;
				amax = amax + a.z;
				if(amin > 255) amin = 255;
				if(amax > 255) amax = 255;
				break;
			case TFX_HIGHLIGHT2:
				// texture range is used unchanged
				break;
			default:
				__assume(0);
			}
		}
		else
		{
			// Only the vertex alpha matters.

			amin = a.x;
			amax = a.z;
		}

		int aref = context->TEST.AREF;

		// For each comparison: decided "pass" if it holds for the whole range, decided "fail"
		// if it holds nowhere in the range, otherwise undecidable here (return false).

		switch(context->TEST.ATST)
		{
		case ATST_NEVER:
			// already handled by the enclosing if; kept so the switch lists every ATST value
			pass = false;
			break;
		case ATST_ALWAYS:
			// already handled by the enclosing if; kept so the switch lists every ATST value
			pass = true;
			break;
		case ATST_LESS:
			if(amax < aref) pass = true;
			else if(amin >= aref) pass = false;
			else return false;
			break;
		case ATST_LEQUAL:
			if(amax <= aref) pass = true;
			else if(amin > aref) pass = false;
			else return false;
			break;
		case ATST_EQUAL:
			if(amin == aref && amax == aref) pass = true;
			else if(amin > aref || amax < aref) pass = false;
			else return false;
			break;
		case ATST_GEQUAL:
			if(amin >= aref) pass = true;
			else if(amax < aref) pass = false;
			else return false;
			break;
		case ATST_GREATER:
			if(amin > aref) pass = true;
			else if(amax <= aref) pass = false;
			else return false;
			break;
		case ATST_NOTEQUAL:
			if(amin == aref && amax == aref) pass = false;
			else if(amin > aref || amax < aref) pass = true;
			else return false;
			break;
		default:
			__assume(0);
		}
	}

	if(!pass)
	{
		// The test fails for the whole batch: fold the AFAIL behaviour into the masks.
		// NOTE(review): set mask bits presumably mean "do not write" - confirm against the rasterizer.

		switch(context->TEST.AFAIL)
		{
		case AFAIL_KEEP: fm = zm = 0xffffffff; break;
		case AFAIL_FB_ONLY: zm = 0xffffffff; break;
		case AFAIL_ZB_ONLY: fm = 0xffffffff; break;
		case AFAIL_RGB_ONLY: fm |= 0xff000000; zm = 0xffffffff; break;
		default: __assume(0);
		}
	}

	return true;
}
void Draw()
{
const GSDrawingEnvironment& env = m_env;
@@ -274,15 +404,28 @@ protected:
//
m_vtrace.eq.value = ((m_vtrace.tmin == m_vtrace.tmax).mask() | ((m_vtrace.cmin == m_vtrace.cmax).mask() << 4));
//
if(PRIM->TME)
{
m_mem.m_clut.Read32(context->TEX0, env.TEXA);
}
//
GSScanlineParam p;
p.vm = m_mem.m_vm32;
m_fbo = m_mem.GetOffset(context->FRAME.Block(), context->FRAME.FBW, context->FRAME.PSM, m_fbo);
m_zbo = m_mem.GetOffset(context->ZBUF.Block(), context->FRAME.FBW, context->ZBUF.PSM, m_zbo);
m_fzbo = m_mem.GetOffset4(context->FRAME, context->ZBUF, m_fzbo);
p.fbo = m_fbo;
p.zbo = m_zbo;
p.fzbo = m_fzbo;
p.sel.dw = 0;
@@ -297,8 +440,6 @@ protected:
p.fm = context->FRAME.FBMSK;
p.zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
//
if(context->TEST.ZTE && context->TEST.ZTST == ZTST_NEVER)
{
p.fm = 0xffffffff;
@@ -307,46 +448,7 @@ protected:
if(context->TEST.ATE)
{
bool pass = context->TEST.ATST != ATST_NEVER;
if((!PRIM->TME || !context->TEX0.TCC && context->TEX0.TFX != TFX_DECAL) && m_vtrace.eq.a)
{
// surprisingly large number of games leave alpha test on when alpha is constant
DWORD a = (DWORD)((int)m_vtrace.v.c.a >> 7);
DWORD aref = context->TEST.AREF;
switch(context->TEST.ATST)
{
case ATST_NEVER: pass = false; break;
case ATST_ALWAYS: pass = true; break;
case ATST_LESS: pass = a < aref; break;
case ATST_LEQUAL: pass = a <= aref; break;
case ATST_EQUAL: pass = a == aref; break;
case ATST_GEQUAL: pass = a >= aref; break;
case ATST_GREATER: pass = a > aref; break;
case ATST_NOTEQUAL: pass = a != aref; break;
default: __assume(0);
}
}
if(!pass)
{
switch(context->TEST.AFAIL)
{
case AFAIL_KEEP: p.fm = p.zm = 0xffffffff; break;
case AFAIL_FB_ONLY: p.zm = 0xffffffff; break;
case AFAIL_ZB_ONLY: p.fm = 0xffffffff; break;
case AFAIL_RGB_ONLY: p.fm |= 0xff000000; p.zm = 0xffffffff; break;
default: __assume(0);
}
// "don't care" values
p.sel.atst = ATST_ALWAYS;
p.sel.afail = 0;
}
else
if(!TryAlphaTest(p.fm, p.zm))
{
p.sel.atst = context->TEST.ATST;
p.sel.afail = context->TEST.AFAIL;
@@ -375,7 +477,7 @@ protected:
if(p.sel.iip == 0 && p.sel.tfx == TFX_MODULATE && p.sel.tcc)
{
if(m_vtrace.eq.rgba == 15 && (m_vtrace.v.c == GSVector4(128.0f * 128.0f)).alltrue())
if(m_vtrace.eq.rgba == 15 && (m_vtrace.cmin == GSVector4(128.0f * 128.0f)).alltrue())
{
// modulate does not do anything when vertex color is 0x80
@@ -383,6 +485,11 @@ protected:
}
}
if(p.sel.tfx == TFX_DECAL)
{
p.sel.tcc = 1;
}
if(p.sel.fst == 0)
{
// skip per pixel division if q is constant
@@ -418,11 +525,13 @@ protected:
{
// if q is constant we can do the half pel shift for bilinear sampling on the vertices
GSVertexSW* v = m_vertices;
GSVector4 half((float)0x8000, (float)0x8000, 0.0f, 0.0f);
for(int i = 0; i < m_count; i++)
for(int i = 0, j = m_count; i < j; i++)
{
m_vertices[i].t -= half;
v[i].t -= half;
}
}
@@ -481,8 +590,6 @@ protected:
if(!t) {ASSERT(0); return;}
m_mem.m_clut.Read32(context->TEX0, env.TEXA);
p.tex = t->m_buff;
p.clut = m_mem.m_clut;
p.tw = t->m_tw;
@@ -514,9 +621,11 @@ protected:
}
if(p.sel.date
|| p.sel.abe != 255
|| p.sel.abea == 1 || p.sel.abeb == 1 || p.sel.abec == 1 || p.sel.abed == 1
|| p.sel.atst != ATST_ALWAYS && p.sel.afail == AFAIL_RGB_ONLY
|| p.fm != 0 && p.fm != 0xffffffff)
|| p.sel.fpsm == 0 && p.fm != 0 && p.fm != 0xffffffff
|| p.sel.fpsm == 1 && (p.fm & 0x00ffffff) != 0 && (p.fm & 0x00ffffff) != 0x00ffffff
|| p.sel.fpsm == 2 && (p.fm & 0x80f8f8f8) != 0 && (p.fm & 0x80f8f8f8) != 0x80f8f8f8)
{
p.sel.rfb = 1;
}
@@ -597,8 +706,8 @@ __int64 start = __rdtsc();
/*
__int64 diff = __rdtsc() - start;
s_total += diff;
if(pixels >= 50000)
fprintf(s_fp, "[%I64d, %d, %d, %d] %08x, diff = %I64d /prim = %I64d /pixel = %I64d \n", frame, PRIM->PRIM, prims, pixels, p.sel, diff, diff / prims, pixels > 0 ? diff / pixels : 0);
if(stats.pixels >= 50000)
fprintf(s_fp, "[%I64d, %d, %d, %d] %08x, diff = %I64d /prim = %I64d /pixel = %I64d \n", frame, PRIM->PRIM, stats.prims, stats.pixels, p.sel, diff, diff / stats.prims, stats.pixels > 0 ? diff / stats.pixels : 0);
*/
// TODO
@@ -664,6 +773,7 @@ public:
: GSRendererT(base, mt, irq, nloophack, rs)
, m_fbo(NULL)
, m_zbo(NULL)
, m_fzbo(NULL)
{
m_rl.Create<GSDrawScanline>(this, threads);

View File

@@ -1019,6 +1019,8 @@ public:
return v;
}
#endif
template<int src, class T> __forceinline GSVector4i gather16_4(const T* ptr) const
{
GSVector4i v;
@@ -1095,6 +1097,8 @@ public:
return v;
}
#if _M_SSE >= 0x401
template<int src, class T> __forceinline GSVector4i gather32_4(const T* ptr) const
{
GSVector4i v;
@@ -1154,7 +1158,56 @@ public:
return v;
}
#ifdef _M_AMD64
#else
template<int src, class T> __forceinline GSVector4i gather32_4(const T* ptr) const
{
	// Plain-C++ variant (this appears to be the #else branch of the _M_SSE >= 0x401 check).
	// Bytes src+0 and src+1 each hold two 4-bit indices; the low nibble is looked up first.

	const int lo0 = (int)ptr[extract8<src + 0>() & 0xf];
	const int hi0 = (int)ptr[extract8<src + 0>() >> 4];
	const int lo1 = (int)ptr[extract8<src + 1>() & 0xf];
	const int hi1 = (int)ptr[extract8<src + 1>() >> 4];

	return GSVector4i(lo0, hi0, lo1, hi1);
}
template<int src, class T> __forceinline GSVector4i gather32_8(const T* ptr) const
{
	// Looks up ptr[] with the four bytes src+0 .. src+3 of this vector, one result per int.

	const int e0 = (int)ptr[extract8<src + 0>()];
	const int e1 = (int)ptr[extract8<src + 1>()];
	const int e2 = (int)ptr[extract8<src + 2>()];
	const int e3 = (int)ptr[extract8<src + 3>()];

	return GSVector4i(e0, e1, e2, e3);
}
template<int src, class T> __forceinline GSVector4i gather32_16(const T* ptr) const
{
	// Looks up ptr[] with the four 16-bit elements src+0 .. src+3 of this vector.

	const int e0 = (int)ptr[extract16<src + 0>()];
	const int e1 = (int)ptr[extract16<src + 1>()];
	const int e2 = (int)ptr[extract16<src + 2>()];
	const int e3 = (int)ptr[extract16<src + 3>()];

	return GSVector4i(e0, e1, e2, e3);
}
template<class T> __forceinline GSVector4i gather32_32(const T* ptr) const
{
	// Each of the four 32-bit elements of this vector is used as an index into ptr[].

	const int e0 = (int)ptr[extract32<0>()];
	const int e1 = (int)ptr[extract32<1>()];
	const int e2 = (int)ptr[extract32<2>()];
	const int e3 = (int)ptr[extract32<3>()];

	return GSVector4i(e0, e1, e2, e3);
}
template<class T1, class T2> __forceinline GSVector4i gather32_32(const T1* ptr1, const T2* ptr2) const
{
	// Two-level lookup: the 32-bit elements of this vector index ptr1[],
	// and the value found there indexes ptr2[].

	const int e0 = (int)ptr2[ptr1[extract32<0>()]];
	const int e1 = (int)ptr2[ptr1[extract32<1>()]];
	const int e2 = (int)ptr2[ptr1[extract32<2>()]];
	const int e3 = (int)ptr2[ptr1[extract32<3>()]];

	return GSVector4i(e0, e1, e2, e3);
}
#endif
#if defined(_M_AMD64) && _M_SSE >= 0x401
template<int src, class T> __forceinline GSVector4i gather64_4(const T* ptr) const
{
@@ -1246,6 +1299,8 @@ public:
#endif
#if _M_SSE >= 0x401
template<class T> __forceinline void gather8_4(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
{
dst[0] = gather8_4<0>(ptr);
@@ -1257,6 +1312,8 @@ public:
dst[0] = gather8_8<>(ptr);
}
#endif
template<class T> __forceinline void gather16_4(const T* RESTRICT ptr, GSVector4i* RESTRICT dst) const
{
dst[0] = gather16_4<0>(ptr);
@@ -1362,156 +1419,6 @@ public:
#endif
#endif
// Constant helpers without arguments.
//
// zero() returns the all-zero vector. invzero() compares zero() with itself and serves as
// the base value for every helper below, which shifts it to produce its constant.
// Each helper is named after the value it is expected to leave in every 32-bit lane
// (8 hex digits, srl32/sll32) or every 16-bit lane (4 hex digits, srl16/sll16).
// NOTE(review): this assumes operator == yields all-ones lanes and that the srl*/sll*
// members are per-lane logical shifts - confirm against their definitions.
static GSVector4i zero()
{
return GSVector4i(_mm_setzero_si128());
}
// Base value for the helpers below: the result of comparing zero() with itself.
static GSVector4i invzero()
{
return zero() == zero();
}
static GSVector4i x00000001()
{
return invzero().srl32(31);
}
static GSVector4i x0001()
{
return invzero().srl16(15);
}
static GSVector4i x00ff()
{
return invzero().srl16(8);
}
static GSVector4i xff00()
{
return invzero().sll16(8);
}
static GSVector4i x000000ff()
{
return invzero().srl32(24);
}
static GSVector4i x80000000()
{
return invzero().sll32(31);
}
static GSVector4i xff000000()
{
return invzero().sll32(24);
}
static GSVector4i xffff0000()
{
return invzero().sll32(16);
}
static GSVector4i x00000fff()
{
return invzero().srl32(20);
}
static GSVector4i x0000ffff()
{
return invzero().srl32(16);
}
static GSVector4i x00ffffff()
{
return invzero().srl32(8);
}
static GSVector4i x00003fff()
{
return invzero().srl32(18);
}
static GSVector4i x00007fff()
{
return invzero().srl32(17);
}
// Constant helpers that take an existing vector.
//
// Same constants as the argument-less overloads, but the base value is produced by
// comparing the caller-supplied vector with itself, so the contents of v do not matter
// as long as v == v is true in every lane (NOTE(review): assumes an integer compare - confirm).
static GSVector4i invzero(const GSVector4i& v)
{
// - vc can't generate a simple pxor xmm0, xmm0 / pcmpeqd xmm0, xmm0
// - it is better if we just use whatever register there is at the moment, instead of loading the result of _mm_setzero_si128 from memory stored somewhere into a temp earlier (we could also load the final value then...)
return v == v;
}
static GSVector4i x00000001(const GSVector4i& v)
{
return invzero(v).srl32(31);
}
static GSVector4i x0001(const GSVector4i& v)
{
return invzero(v).srl16(15);
}
static GSVector4i x00ff(const GSVector4i& v)
{
return invzero(v).srl16(8);
}
static GSVector4i xff00(const GSVector4i& v)
{
return invzero(v).sll16(8);
}
static GSVector4i x000000ff(const GSVector4i& v)
{
return invzero(v).srl32(24);
}
static GSVector4i x80000000(const GSVector4i& v)
{
return invzero(v).sll32(31);
}
static GSVector4i xff000000(const GSVector4i& v)
{
return invzero(v).sll32(24);
}
static GSVector4i xffff0000(const GSVector4i& v)
{
return invzero(v).sll32(16);
}
static GSVector4i x00000fff(const GSVector4i& v)
{
return invzero(v).srl32(20);
}
static GSVector4i x0000ffff(const GSVector4i& v)
{
return invzero(v).srl32(16);
}
static GSVector4i x00ffffff(const GSVector4i& v)
{
return invzero(v).srl32(8);
}
static GSVector4i x00003fff(const GSVector4i& v)
{
return invzero(v).srl32(18);
}
static GSVector4i x00007fff(const GSVector4i& v)
{
return invzero(v).srl32(17);
}
#if _M_SSE >= 0x401
static GSVector4i loadnt(const void* p)
@@ -1706,7 +1613,7 @@ public:
GSVector4i* s = (GSVector4i*)src;
GSVector4i* d = (GSVector4i*)dst;
GSVector4i v = GSVector4i::invzero();
GSVector4i v = GSVector4i::xffffffff();
for(int i = 0; i < size; i++)
{
@@ -1725,7 +1632,7 @@ public:
GSVector4i* s = (GSVector4i*)src;
GSVector4i* d = (GSVector4i*)dst;
GSVector4i v = GSVector4i::invzero();
GSVector4i v = GSVector4i::xffffffff();
for(int i = 0; i < size; i++)
{
@@ -1912,6 +1819,204 @@ public:
VECTOR4i_SHUFFLE_1(y, 1)
VECTOR4i_SHUFFLE_1(z, 2)
VECTOR4i_SHUFFLE_1(w, 3)
// Constant generators without arguments.
//
// xffffffff() compares zero() with itself; every other helper shifts that value.
// The function name spells the value expected in each 32-bit lane (8 hex digits, built
// with srl32/sll32) or in each 16-bit lane (4 hex digits, built with srl16/sll16).
// NOTE(review): this assumes operator == yields all-ones lanes and that the srl*/sll*
// members are per-lane logical shifts - confirm against their definitions.
static GSVector4i zero() {return GSVector4i(_mm_setzero_si128());}
static GSVector4i xffffffff() {return zero() == zero();}
// 32-bit lanes, low bits set: right shift by (32 - number of set bits).
static GSVector4i x00000001() {return xffffffff().srl32(31);}
static GSVector4i x00000003() {return xffffffff().srl32(30);}
static GSVector4i x00000007() {return xffffffff().srl32(29);}
static GSVector4i x0000000f() {return xffffffff().srl32(28);}
static GSVector4i x0000001f() {return xffffffff().srl32(27);}
static GSVector4i x0000003f() {return xffffffff().srl32(26);}
static GSVector4i x0000007f() {return xffffffff().srl32(25);}
static GSVector4i x000000ff() {return xffffffff().srl32(24);}
static GSVector4i x000001ff() {return xffffffff().srl32(23);}
static GSVector4i x000003ff() {return xffffffff().srl32(22);}
static GSVector4i x000007ff() {return xffffffff().srl32(21);}
static GSVector4i x00000fff() {return xffffffff().srl32(20);}
static GSVector4i x00001fff() {return xffffffff().srl32(19);}
static GSVector4i x00003fff() {return xffffffff().srl32(18);}
static GSVector4i x00007fff() {return xffffffff().srl32(17);}
static GSVector4i x0000ffff() {return xffffffff().srl32(16);}
static GSVector4i x0001ffff() {return xffffffff().srl32(15);}
static GSVector4i x0003ffff() {return xffffffff().srl32(14);}
static GSVector4i x0007ffff() {return xffffffff().srl32(13);}
static GSVector4i x000fffff() {return xffffffff().srl32(12);}
static GSVector4i x001fffff() {return xffffffff().srl32(11);}
static GSVector4i x003fffff() {return xffffffff().srl32(10);}
static GSVector4i x007fffff() {return xffffffff().srl32( 9);}
static GSVector4i x00ffffff() {return xffffffff().srl32( 8);}
static GSVector4i x01ffffff() {return xffffffff().srl32( 7);}
static GSVector4i x03ffffff() {return xffffffff().srl32( 6);}
static GSVector4i x07ffffff() {return xffffffff().srl32( 5);}
static GSVector4i x0fffffff() {return xffffffff().srl32( 4);}
static GSVector4i x1fffffff() {return xffffffff().srl32( 3);}
static GSVector4i x3fffffff() {return xffffffff().srl32( 2);}
static GSVector4i x7fffffff() {return xffffffff().srl32( 1);}
// 32-bit lanes, high bits set: left shift by (32 - number of set bits).
static GSVector4i x80000000() {return xffffffff().sll32(31);}
static GSVector4i xc0000000() {return xffffffff().sll32(30);}
static GSVector4i xe0000000() {return xffffffff().sll32(29);}
static GSVector4i xf0000000() {return xffffffff().sll32(28);}
static GSVector4i xf8000000() {return xffffffff().sll32(27);}
static GSVector4i xfc000000() {return xffffffff().sll32(26);}
static GSVector4i xfe000000() {return xffffffff().sll32(25);}
static GSVector4i xff000000() {return xffffffff().sll32(24);}
static GSVector4i xff800000() {return xffffffff().sll32(23);}
static GSVector4i xffc00000() {return xffffffff().sll32(22);}
static GSVector4i xffe00000() {return xffffffff().sll32(21);}
static GSVector4i xfff00000() {return xffffffff().sll32(20);}
static GSVector4i xfff80000() {return xffffffff().sll32(19);}
static GSVector4i xfffc0000() {return xffffffff().sll32(18);}
static GSVector4i xfffe0000() {return xffffffff().sll32(17);}
static GSVector4i xffff0000() {return xffffffff().sll32(16);}
static GSVector4i xffff8000() {return xffffffff().sll32(15);}
static GSVector4i xffffc000() {return xffffffff().sll32(14);}
static GSVector4i xffffe000() {return xffffffff().sll32(13);}
static GSVector4i xfffff000() {return xffffffff().sll32(12);}
static GSVector4i xfffff800() {return xffffffff().sll32(11);}
static GSVector4i xfffffc00() {return xffffffff().sll32(10);}
static GSVector4i xfffffe00() {return xffffffff().sll32( 9);}
static GSVector4i xffffff00() {return xffffffff().sll32( 8);}
static GSVector4i xffffff80() {return xffffffff().sll32( 7);}
static GSVector4i xffffffc0() {return xffffffff().sll32( 6);}
static GSVector4i xffffffe0() {return xffffffff().sll32( 5);}
static GSVector4i xfffffff0() {return xffffffff().sll32( 4);}
static GSVector4i xfffffff8() {return xffffffff().sll32( 3);}
static GSVector4i xfffffffc() {return xffffffff().sll32( 2);}
static GSVector4i xfffffffe() {return xffffffff().sll32( 1);}
// 16-bit lanes, low bits set: right shift by (16 - number of set bits).
static GSVector4i x0001() {return xffffffff().srl16(15);}
static GSVector4i x0003() {return xffffffff().srl16(14);}
static GSVector4i x0007() {return xffffffff().srl16(13);}
static GSVector4i x000f() {return xffffffff().srl16(12);}
static GSVector4i x001f() {return xffffffff().srl16(11);}
static GSVector4i x003f() {return xffffffff().srl16(10);}
static GSVector4i x007f() {return xffffffff().srl16( 9);}
static GSVector4i x00ff() {return xffffffff().srl16( 8);}
static GSVector4i x01ff() {return xffffffff().srl16( 7);}
static GSVector4i x03ff() {return xffffffff().srl16( 6);}
static GSVector4i x07ff() {return xffffffff().srl16( 5);}
static GSVector4i x0fff() {return xffffffff().srl16( 4);}
static GSVector4i x1fff() {return xffffffff().srl16( 3);}
static GSVector4i x3fff() {return xffffffff().srl16( 2);}
static GSVector4i x7fff() {return xffffffff().srl16( 1);}
// 16-bit lanes, high bits set: left shift by (16 - number of set bits).
static GSVector4i x8000() {return xffffffff().sll16(15);}
static GSVector4i xc000() {return xffffffff().sll16(14);}
static GSVector4i xe000() {return xffffffff().sll16(13);}
static GSVector4i xf000() {return xffffffff().sll16(12);}
static GSVector4i xf800() {return xffffffff().sll16(11);}
static GSVector4i xfc00() {return xffffffff().sll16(10);}
static GSVector4i xfe00() {return xffffffff().sll16( 9);}
static GSVector4i xff00() {return xffffffff().sll16( 8);}
static GSVector4i xff80() {return xffffffff().sll16( 7);}
static GSVector4i xffc0() {return xffffffff().sll16( 6);}
static GSVector4i xffe0() {return xffffffff().sll16( 5);}
static GSVector4i xfff0() {return xffffffff().sll16( 4);}
static GSVector4i xfff8() {return xffffffff().sll16( 3);}
static GSVector4i xfffc() {return xffffffff().sll16( 2);}
static GSVector4i xfffe() {return xffffffff().sll16( 1);}
// Constant generators that take an existing vector.
//
// xffffffff(v) compares v with itself, so the contents of v are irrelevant; every other
// helper shifts that value. The function name spells the value expected in each 32-bit
// lane (8 hex digits, srl32/sll32) or in each 16-bit lane (4 hex digits, srl16/sll16).
// NOTE(review): this assumes operator == yields all-ones lanes and that the srl*/sll*
// members are per-lane logical shifts - confirm against their definitions.
static GSVector4i xffffffff(const GSVector4i& v) {return v == v;}
// 32-bit lanes, low bits set: right shift by (32 - number of set bits).
static GSVector4i x00000001(const GSVector4i& v) {return xffffffff(v).srl32(31);}
static GSVector4i x00000003(const GSVector4i& v) {return xffffffff(v).srl32(30);}
static GSVector4i x00000007(const GSVector4i& v) {return xffffffff(v).srl32(29);}
static GSVector4i x0000000f(const GSVector4i& v) {return xffffffff(v).srl32(28);}
static GSVector4i x0000001f(const GSVector4i& v) {return xffffffff(v).srl32(27);}
static GSVector4i x0000003f(const GSVector4i& v) {return xffffffff(v).srl32(26);}
static GSVector4i x0000007f(const GSVector4i& v) {return xffffffff(v).srl32(25);}
static GSVector4i x000000ff(const GSVector4i& v) {return xffffffff(v).srl32(24);}
static GSVector4i x000001ff(const GSVector4i& v) {return xffffffff(v).srl32(23);}
static GSVector4i x000003ff(const GSVector4i& v) {return xffffffff(v).srl32(22);}
static GSVector4i x000007ff(const GSVector4i& v) {return xffffffff(v).srl32(21);}
static GSVector4i x00000fff(const GSVector4i& v) {return xffffffff(v).srl32(20);}
static GSVector4i x00001fff(const GSVector4i& v) {return xffffffff(v).srl32(19);}
static GSVector4i x00003fff(const GSVector4i& v) {return xffffffff(v).srl32(18);}
static GSVector4i x00007fff(const GSVector4i& v) {return xffffffff(v).srl32(17);}
static GSVector4i x0000ffff(const GSVector4i& v) {return xffffffff(v).srl32(16);}
static GSVector4i x0001ffff(const GSVector4i& v) {return xffffffff(v).srl32(15);}
static GSVector4i x0003ffff(const GSVector4i& v) {return xffffffff(v).srl32(14);}
static GSVector4i x0007ffff(const GSVector4i& v) {return xffffffff(v).srl32(13);}
static GSVector4i x000fffff(const GSVector4i& v) {return xffffffff(v).srl32(12);}
static GSVector4i x001fffff(const GSVector4i& v) {return xffffffff(v).srl32(11);}
static GSVector4i x003fffff(const GSVector4i& v) {return xffffffff(v).srl32(10);}
static GSVector4i x007fffff(const GSVector4i& v) {return xffffffff(v).srl32( 9);}
static GSVector4i x00ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 8);}
static GSVector4i x01ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 7);}
static GSVector4i x03ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 6);}
static GSVector4i x07ffffff(const GSVector4i& v) {return xffffffff(v).srl32( 5);}
static GSVector4i x0fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 4);}
static GSVector4i x1fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 3);}
static GSVector4i x3fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 2);}
static GSVector4i x7fffffff(const GSVector4i& v) {return xffffffff(v).srl32( 1);}
// 32-bit lanes, high bits set: left shift by (32 - number of set bits).
static GSVector4i x80000000(const GSVector4i& v) {return xffffffff(v).sll32(31);}
static GSVector4i xc0000000(const GSVector4i& v) {return xffffffff(v).sll32(30);}
static GSVector4i xe0000000(const GSVector4i& v) {return xffffffff(v).sll32(29);}
static GSVector4i xf0000000(const GSVector4i& v) {return xffffffff(v).sll32(28);}
static GSVector4i xf8000000(const GSVector4i& v) {return xffffffff(v).sll32(27);}
static GSVector4i xfc000000(const GSVector4i& v) {return xffffffff(v).sll32(26);}
static GSVector4i xfe000000(const GSVector4i& v) {return xffffffff(v).sll32(25);}
static GSVector4i xff000000(const GSVector4i& v) {return xffffffff(v).sll32(24);}
static GSVector4i xff800000(const GSVector4i& v) {return xffffffff(v).sll32(23);}
static GSVector4i xffc00000(const GSVector4i& v) {return xffffffff(v).sll32(22);}
static GSVector4i xffe00000(const GSVector4i& v) {return xffffffff(v).sll32(21);}
static GSVector4i xfff00000(const GSVector4i& v) {return xffffffff(v).sll32(20);}
static GSVector4i xfff80000(const GSVector4i& v) {return xffffffff(v).sll32(19);}
static GSVector4i xfffc0000(const GSVector4i& v) {return xffffffff(v).sll32(18);}
static GSVector4i xfffe0000(const GSVector4i& v) {return xffffffff(v).sll32(17);}
static GSVector4i xffff0000(const GSVector4i& v) {return xffffffff(v).sll32(16);}
static GSVector4i xffff8000(const GSVector4i& v) {return xffffffff(v).sll32(15);}
static GSVector4i xffffc000(const GSVector4i& v) {return xffffffff(v).sll32(14);}
static GSVector4i xffffe000(const GSVector4i& v) {return xffffffff(v).sll32(13);}
static GSVector4i xfffff000(const GSVector4i& v) {return xffffffff(v).sll32(12);}
static GSVector4i xfffff800(const GSVector4i& v) {return xffffffff(v).sll32(11);}
static GSVector4i xfffffc00(const GSVector4i& v) {return xffffffff(v).sll32(10);}
static GSVector4i xfffffe00(const GSVector4i& v) {return xffffffff(v).sll32( 9);}
static GSVector4i xffffff00(const GSVector4i& v) {return xffffffff(v).sll32( 8);}
static GSVector4i xffffff80(const GSVector4i& v) {return xffffffff(v).sll32( 7);}
static GSVector4i xffffffc0(const GSVector4i& v) {return xffffffff(v).sll32( 6);}
static GSVector4i xffffffe0(const GSVector4i& v) {return xffffffff(v).sll32( 5);}
static GSVector4i xfffffff0(const GSVector4i& v) {return xffffffff(v).sll32( 4);}
static GSVector4i xfffffff8(const GSVector4i& v) {return xffffffff(v).sll32( 3);}
static GSVector4i xfffffffc(const GSVector4i& v) {return xffffffff(v).sll32( 2);}
static GSVector4i xfffffffe(const GSVector4i& v) {return xffffffff(v).sll32( 1);}
// 16-bit lanes, low bits set: right shift by (16 - number of set bits).
static GSVector4i x0001(const GSVector4i& v) {return xffffffff(v).srl16(15);}
static GSVector4i x0003(const GSVector4i& v) {return xffffffff(v).srl16(14);}
static GSVector4i x0007(const GSVector4i& v) {return xffffffff(v).srl16(13);}
static GSVector4i x000f(const GSVector4i& v) {return xffffffff(v).srl16(12);}
static GSVector4i x001f(const GSVector4i& v) {return xffffffff(v).srl16(11);}
static GSVector4i x003f(const GSVector4i& v) {return xffffffff(v).srl16(10);}
static GSVector4i x007f(const GSVector4i& v) {return xffffffff(v).srl16( 9);}
static GSVector4i x00ff(const GSVector4i& v) {return xffffffff(v).srl16( 8);}
static GSVector4i x01ff(const GSVector4i& v) {return xffffffff(v).srl16( 7);}
static GSVector4i x03ff(const GSVector4i& v) {return xffffffff(v).srl16( 6);}
static GSVector4i x07ff(const GSVector4i& v) {return xffffffff(v).srl16( 5);}
static GSVector4i x0fff(const GSVector4i& v) {return xffffffff(v).srl16( 4);}
static GSVector4i x1fff(const GSVector4i& v) {return xffffffff(v).srl16( 3);}
static GSVector4i x3fff(const GSVector4i& v) {return xffffffff(v).srl16( 2);}
static GSVector4i x7fff(const GSVector4i& v) {return xffffffff(v).srl16( 1);}
// 16-bit lanes, high bits set: left shift by (16 - number of set bits).
static GSVector4i x8000(const GSVector4i& v) {return xffffffff(v).sll16(15);}
static GSVector4i xc000(const GSVector4i& v) {return xffffffff(v).sll16(14);}
static GSVector4i xe000(const GSVector4i& v) {return xffffffff(v).sll16(13);}
static GSVector4i xf000(const GSVector4i& v) {return xffffffff(v).sll16(12);}
static GSVector4i xf800(const GSVector4i& v) {return xffffffff(v).sll16(11);}
static GSVector4i xfc00(const GSVector4i& v) {return xffffffff(v).sll16(10);}
static GSVector4i xfe00(const GSVector4i& v) {return xffffffff(v).sll16( 9);}
static GSVector4i xff00(const GSVector4i& v) {return xffffffff(v).sll16( 8);}
static GSVector4i xff80(const GSVector4i& v) {return xffffffff(v).sll16( 7);}
static GSVector4i xffc0(const GSVector4i& v) {return xffffffff(v).sll16( 6);}
static GSVector4i xffe0(const GSVector4i& v) {return xffffffff(v).sll16( 5);}
static GSVector4i xfff0(const GSVector4i& v) {return xffffffff(v).sll16( 4);}
static GSVector4i xfff8(const GSVector4i& v) {return xffffffff(v).sll16( 3);}
static GSVector4i xfffc(const GSVector4i& v) {return xffffffff(v).sll16( 2);}
static GSVector4i xfffe(const GSVector4i& v) {return xffffffff(v).sll16( 1);}
};
__declspec(align(16)) class GSVector4
@@ -1941,7 +2046,7 @@ public:
{
}
GSVector4(float x, float y, float z = 0.5f, float w = 1.0f)
GSVector4(float x, float y, float z, float w)
{
m = _mm_set_ps(w, z, y, x);
}
@@ -2211,7 +2316,7 @@ public:
return GSVector4(_mm_setzero_ps());
}
static GSVector4 invzero()
static GSVector4 xffffffff()
{
return zero() == zero();
}
@@ -2231,6 +2336,11 @@ public:
return GSVector4(_mm_castpd_ps(_mm_load_sd((double*)p)));
}
static GSVector4 load(float f)
{
	// _mm_load_ss places f in the lowest element and clears the remaining three.

	const __m128 scalar = _mm_load_ss(&f);

	return GSVector4(scalar);
}
template<bool aligned> static GSVector4 load(const void* p)
{
return GSVector4i(aligned ? _mm_load_ps((__m128*)p) : _mm_loadu_ps((__m128*)p));
@@ -2249,7 +2359,7 @@ public:
__forceinline static void expand(const GSVector4i& v, GSVector4& a, GSVector4& b, GSVector4& c, GSVector4& d)
{
GSVector4i mask = GSVector4i::x000000ff(v);
GSVector4i mask = GSVector4i::x000000ff();
a = v & mask;
b = (v >> 8) & mask;