GSdx: The sw renderer now uses avx2, not much faster though, +10% maybe, if the game is not EE limited. I'm not sure if haswell has that much better sse execution (load/store units doubled for example), or the avx2 code is not fully optimized yet.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5677 96395faa-99c1-11dd-bbfe-3dabce05a288
2025-02-18 16:39:18 +00:00 · 2013-06-20 05:07:52 +00:00 · 2013-06-20 05:07:52 +00:00 · d20bd4f86a
commit d20bd4f86a
parent 3b753bec42
20 changed files with 5021 additions and 88 deletions
--- a/plugins/GSdx/GSBlock.h
+++ b/plugins/GSdx/GSBlock.h
@ -1789,23 +1789,39 @@ public:
 		GSVector8i TA0(TEXA.TA0 << 24);
 		GSVector8i mask = GSVector8i::x00ffffff();

-		for(int i = 0; i < 4; i++, dst += dstpitch * 2)
-		{
-			GSVector8i v0 = s[i * 2 + 0];
-			GSVector8i v1 = s[i * 2 + 1];
+		GSVector8i v0, v1, v2, v3;

-			GSVector8i::sw128(v0, v1);
-			GSVector8i::sw64(v0, v1);
+		v0 = s[0] & mask;
+		v1 = s[1] & mask;
+		v2 = s[2] & mask;
+		v3 = s[3] & mask;

-			v0 &= mask;
-			v1 &= mask;
+		GSVector8i::sw128(v0, v1);
+		GSVector8i::sw64(v0, v1);
+		GSVector8i::sw128(v2, v3);
+		GSVector8i::sw64(v2, v3);

-			GSVector8i* d0 = (GSVector8i*)&dst[dstpitch * 0];
-			GSVector8i* d1 = (GSVector8i*)&dst[dstpitch * 1];
+		*(GSVector8i*)&dst[dstpitch * 0] = Expand24to32<AEM>(v0, TA0);
+		*(GSVector8i*)&dst[dstpitch * 1] = Expand24to32<AEM>(v1, TA0);
+		*(GSVector8i*)&dst[dstpitch * 2] = Expand24to32<AEM>(v2, TA0);
+		*(GSVector8i*)&dst[dstpitch * 3] = Expand24to32<AEM>(v3, TA0);

-			d0[0] = Expand24to32<AEM>(v0, TA0);
-			d1[0] = Expand24to32<AEM>(v1, TA0);
-		}
+		v0 = s[4] & mask;
+		v1 = s[5] & mask;
+		v2 = s[6] & mask;
+		v3 = s[7] & mask;
+
+		GSVector8i::sw128(v0, v1);
+		GSVector8i::sw64(v0, v1);
+		GSVector8i::sw128(v2, v3);
+		GSVector8i::sw64(v2, v3);
+
+		dst += dstpitch * 4;
+
+		*(GSVector8i*)&dst[dstpitch * 0] = Expand24to32<AEM>(v0, TA0);
+		*(GSVector8i*)&dst[dstpitch * 1] = Expand24to32<AEM>(v1, TA0);
+		*(GSVector8i*)&dst[dstpitch * 2] = Expand24to32<AEM>(v2, TA0);
+		*(GSVector8i*)&dst[dstpitch * 3] = Expand24to32<AEM>(v3, TA0);

 		#else

--- a/plugins/GSdx/GSDrawScanline.cpp
+++ b/plugins/GSdx/GSDrawScanline.cpp
--- a/plugins/GSdx/GSDrawScanline.h
+++ b/plugins/GSdx/GSDrawScanline.h
@ -81,8 +81,8 @@ public:
 	bool IsEdge() const {return m_global.sel.aa1;}
 	bool IsRect() const {return m_global.sel.IsSolidRect();}

-	bool TestAlpha(GSVector4i& test, GSVector4i& fm, GSVector4i& zm, const GSVector4i& ga);
-	void WritePixel(const GSVector4i& src, int addr, int i, uint32 psm);
+	template<class T> bool TestAlpha(T& test, T& fm, T& zm, const T& ga);
+	template<class T> void WritePixel(const T& src, int addr, int i, uint32 psm);

 #endif

--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp
@ -22,6 +22,38 @@
 #include "stdafx.h"
 #include "GSDrawScanlineCodeGenerator.h"

+#if _M_SSE >= 0x501
+
+const GSVector8i GSDrawScanlineCodeGenerator::m_test[16] =
+{
+	GSVector8i::zero(),
+	GSVector8i(0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000),
+	GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000),
+	GSVector8i(0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
+	GSVector8i(0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
+	GSVector8i(0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
+	GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
+	GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff),
+	GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff),
+	GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff),
+	GSVector8i::zero(),
+};
+
+const GSVector8 GSDrawScanlineCodeGenerator::m_log2_coef[4] = 
+{
+	GSVector8(0.204446009836232697516f),
+	GSVector8(-1.04913055217340124191f),
+	GSVector8(2.28330284476918490682f),
+	GSVector8(1.0f),
+};
+
+#else
+
 const GSVector4i GSDrawScanlineCodeGenerator::m_test[8] =
 {
 	GSVector4i::zero(),
@ -42,6 +74,8 @@ const GSVector4 GSDrawScanlineCodeGenerator::m_log2_coef[4] =
 	GSVector4(1.0f),
 };

+#endif
+
 GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
 	: GSCodeGenerator(code, maxsize)
 	, m_local(*(GSScanlineLocalData*)param)
@ -51,6 +85,81 @@ GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key
 	Generate();
 }

+#if _M_SSE >= 0x501
+
+void GSDrawScanlineCodeGenerator::modulate16(const Ymm& a, const Operand& f, int shift)
+{
+	if(shift == 0)
+	{
+		vpmulhrsw(a, f);
+	}
+	else
+	{
+		vpsllw(a, (uint8)(shift + 1));
+		vpmulhw(a, f);
+	}
+}
+
+void GSDrawScanlineCodeGenerator::lerp16(const Ymm& a, const Ymm& b, const Ymm& f, int shift)
+{
+	vpsubw(a, b);
+	modulate16(a, f, shift);
+	vpaddw(a, b);
+}
+
+void GSDrawScanlineCodeGenerator::lerp16_4(const Ymm& a, const Ymm& b, const Ymm& f)
+{
+	vpsubw(a, b);
+	vpmullw(a, f);
+	vpsraw(a, 4);
+	vpaddw(a, b);
+}
+
+void GSDrawScanlineCodeGenerator::mix16(const Ymm& a, const Ymm& b, const Ymm& temp)
+{
+	vpblendw(a, b, 0xaa);
+}
+
+void GSDrawScanlineCodeGenerator::clamp16(const Ymm& a, const Ymm& temp)
+{
+	vpackuswb(a, a);
+	vpermq(a, a, _MM_SHUFFLE(3, 1, 2, 0)); // this sucks
+	vpmovzxbw(a, a);
+}
+
+void GSDrawScanlineCodeGenerator::alltrue()
+{
+	vpmovmskb(eax, ymm7);
+	cmp(eax, 0xffffffff);
+	je("step", T_NEAR);
+}
+
+void GSDrawScanlineCodeGenerator::blend(const Ymm& a, const Ymm& b, const Ymm& mask)
+{
+	vpand(b, mask);
+	vpandn(mask, a);
+	vpor(a, b, mask);
+}
+
+void GSDrawScanlineCodeGenerator::blendr(const Ymm& b, const Ymm& a, const Ymm& mask)
+{
+	vpand(b, mask);
+	vpandn(mask, a);
+	vpor(b, mask);
+}
+
+void GSDrawScanlineCodeGenerator::blend8(const Ymm& a, const Ymm& b)
+{
+	vpblendvb(a, a, b, xmm0);
+}
+
+void GSDrawScanlineCodeGenerator::blend8r(const Ymm& b, const Ymm& a)
+{
+	vpblendvb(b, a, b, xmm0);
+}
+
+#else
+
 void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, int shift)
 {
 	#if _M_SSE >= 0x500
@ -244,3 +353,5 @@ void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)

 	#endif
 }
+
+#endif
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.h
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.h
@ -35,6 +35,55 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator

 	void Generate();

+	#if _M_SSE >= 0x501
+
+	void Init();
+	void Step();
+	void TestZ(const Ymm& temp1, const Ymm& temp2);
+	void SampleTexture();
+	void Wrap(const Ymm& uv0);
+	void Wrap(const Ymm& uv0, const Ymm& uv1);
+	void SampleTextureLOD();
+	void WrapLOD(const Ymm& uv0);
+	void WrapLOD(const Ymm& uv0, const Ymm& uv1);
+	void AlphaTFX();
+	void ReadMask();
+	void TestAlpha();
+	void ColorTFX();
+	void Fog();
+	void ReadFrame();
+	void TestDestAlpha();
+	void WriteMask();
+	void WriteZBuf();
+	void AlphaBlend();
+	void WriteFrame();
+
+	#if defined(_M_AMD64) || defined(_WIN64)
+	void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg64& addr);
+	void WritePixel(const Ymm& src, const Ymm& temp, const Reg64& addr, const Reg32& mask, bool fast, int psm, int fz);
+	void WritePixel(const Xmm& src, const Reg64& addr, uint8 i, uint8 j, int psm);
+	#else
+	void ReadPixel(const Ymm& dst, const Ymm& temp, const Reg32& addr);
+	void WritePixel(const Ymm& src, const Ymm& temp, const Reg32& addr, const Reg32& mask, bool fast, int psm, int fz);
+	void WritePixel(const Xmm& src, const Reg32& addr, uint8 i, uint8 j, int psm);
+	#endif
+
+	void ReadTexel(int pixels, int mip_offset = 0);
+	void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i);
+
+	void modulate16(const Ymm& a, const Operand& f, int shift);
+	void lerp16(const Ymm& a, const Ymm& b, const Ymm& f, int shift);
+	void lerp16_4(const Ymm& a, const Ymm& b, const Ymm& f);
+	void mix16(const Ymm& a, const Ymm& b, const Ymm& temp);
+	void clamp16(const Ymm& a, const Ymm& temp);
+	void alltrue();
+	void blend(const Ymm& a, const Ymm& b, const Ymm& mask);
+	void blendr(const Ymm& b, const Ymm& a, const Ymm& mask);
+	void blend8(const Ymm& a, const Ymm& b);
+	void blend8r(const Ymm& b, const Ymm& a);
+
+	#else
+
 	void Init();
 	void Step();
 	void TestZ(const Xmm& temp1, const Xmm& temp2);
@ -80,9 +129,17 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
 	void blend8(const Xmm& a, const Xmm& b);
 	void blend8r(const Xmm& b, const Xmm& a);

+	#endif
+
 public:
 	GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);

+	#if _M_SSE >= 0x501
+	static const GSVector8i m_test[16];
+	static const GSVector8 m_log2_coef[4];
+	#else
 	static const GSVector4i m_test[8];
 	static const GSVector4 m_log2_coef[4];
+	#endif
+
 };
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp
@ -23,7 +23,7 @@
 #include "GSDrawScanlineCodeGenerator.h"
 #include "GSVertexSW.h"

-#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
+#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64))

 static const int _args = 16;
 static const int _top = _args + 4;
@ -1236,24 +1236,21 @@ return;

 			// m_local.gd->t.minmax => m_local.temp.uv_minmax[0/1]

-			vmovq(xmm4, ptr[&m_local.gd->t.minmax]); // x x x x maxv maxu minv minu
-			vpunpcklwd(xmm4, xmm4); // maxv maxv maxu maxu minv minv minu minu
-
 			vpxor(xmm1, xmm1);
-			
-			vpunpckldq(xmm6, xmm4, xmm4); // minv minv minv minv minu minu minu minu
-			vpunpcklwd(xmm5, xmm6, xmm1); // 0 minu 0 minu 0 minu 0 minu
+
+			vmovdqa(xmm4, ptr[&m_local.gd->t.min]);
+			vpunpcklwd(xmm5, xmm4, xmm1); // minu
+			vpunpckhwd(xmm6, xmm4, xmm1); // minv
 			vpsrlvd(xmm5, xmm5, xmm0);
-			vpunpckhwd(xmm6, xmm6, xmm1); // 0 minv 0 minv 0 minv 0 minv
 			vpsrlvd(xmm6, xmm6, xmm0);
-			vpackusdw(xmm5, xmm6); // xmm5 = minv minv minv minv minu minu minu minu
-			
-			vpunpckhdq(xmm4, xmm4); // maxv maxv maxv maxv maxu maxu maxu maxu
-			vpunpcklwd(xmm6, xmm4, xmm1); // 0 maxu 0 maxu 0 maxu 0 maxu
+			vpackusdw(xmm5, xmm6);
+
+			vmovdqa(xmm4, ptr[&m_local.gd->t.max]);
+			vpunpcklwd(xmm6, xmm4, xmm1); // maxu
+			vpunpckhwd(xmm4, xmm4, xmm1); // maxv
 			vpsrlvd(xmm6, xmm6, xmm0);
-			vpunpckhwd(xmm4, xmm1); // 0 maxv 0 maxv 0 maxv 0 maxv
 			vpsrlvd(xmm4, xmm4, xmm0);
-			vpackusdw(xmm6, xmm4); // xmm6 = maxv maxv maxv maxv maxu maxu maxu maxu
+			vpackusdw(xmm6, xmm4);

 			vmovdqa(ptr[&m_local.temp.uv_minmax[0]], xmm5);
 			vmovdqa(ptr[&m_local.temp.uv_minmax[1]], xmm6);
@ -2807,7 +2804,7 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr,
 	}
 }

-static const int s_offsets[4] = {0, 2, 8, 10};
+static const int s_offsets[] = {0, 2, 8, 10};

 void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, uint8 i, int psm)
 {
@ -2865,7 +2862,7 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
 			vmovdqa(ptr[&m_local.temp.test], xmm7);
 		}

-		for(int j = 0; j < 4; j++)
+		for(uint8 j = 0; j < 4; j++)
 		{
 			mov(ebx, ptr[&lod_i->u32[j]]);
 			mov(ebx, ptr[ebp + ebx * sizeof(void*) + mip_offset]);
@ -2895,18 +2892,9 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)

 		for(int i = 0; i < pixels; i++)
 		{
-			if(m_cpu.has(util::Cpu::tAVX2) && !m_sel.tlu) // vpgatherdd seems to be dead slow for byte aligned offsets, not using it for palette lookups
+			for(uint8 j = 0; j < 4; j++)
 			{
-				Xmm mask = Xmm(t[i]);
-				vpcmpeqd(mask, mask);
-				vpgatherdd(Xmm(r[i * 2 + 1]), ptr[ebx + Xmm(r[i * 2 + 0]) * 4], mask);
-			}
-			else
-			{
-				for(int j = 0; j < 4; j++)
-				{
-					ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
-				}
+				ReadTexel(Xmm(r[i * 2 + 1]), Xmm(r[i * 2 + 0]), j);
 			}
 		}
 	}
@ -2914,6 +2902,8 @@ void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)

 void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
 {
+	ASSERT(i < 4);
+
 	const Address& src = m_sel.tlu ? ptr[edx + eax * 4] : ptr[ebx + eax * 4];

 	if(i == 0) vmovd(eax, addr);
--- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp
+++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx2.cpp
--- a/plugins/GSdx/GSFunctionMap.h
+++ b/plugins/GSdx/GSFunctionMap.h
@ -209,7 +209,7 @@ public:

 				iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &ml);
 /*
-				name = format("c:/temp/%s_%016llx.bin", m_name.c_str(), (uint64)key);
+				name = format("c:/temp1/%s_%016llx.bin", m_name.c_str(), (uint64)key);

 				if(FILE* fp = fopen(name.c_str(), "wb"))
 				{
@ -218,7 +218,7 @@ public:
 					fputc(0x64, fp); fputc(0x67, fp); fputc(0x90, fp);

 					fwrite(cg->getCode(), cg->getSize(), 1, fp);
-					
+
 					fputc(0xBB, fp); fputc(0xDE, fp); fputc(0x00, fp); fputc(0x00, fp); fputc(0x00, fp);
 					fputc(0x64, fp); fputc(0x67, fp); fputc(0x90, fp);
 					fputc(0x0F, fp); fputc(0x0B, fp);
--- a/plugins/GSdx/GSRasterizer.cpp
+++ b/plugins/GSdx/GSRasterizer.cpp
@ -208,6 +208,10 @@ void GSRasterizer::Draw(GSRasterizerData* data)
 		__assume(0);
 	}

+	#if _M_SSE >= 0x501
+	_mm256_zeroupper();
+	#endif
+
 	data->pixels = m_pixels;

 	uint64 ticks = __rdtsc() - data->start;
@ -917,7 +921,7 @@ GSRasterizerList::GSRasterizerList(int threads, GSPerfMon* perfmon)
 	{
 		for(int i = 0; i < threads; i++, row++)
 		{
-			m_scanline[row] = i;
+			m_scanline[row] = (uint8)i;
 		}
 	}
 }
--- a/plugins/GSdx/GSRendererSW.cpp
+++ b/plugins/GSdx/GSRendererSW.cpp
@ -400,7 +400,7 @@ void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex*

 		dst->t = t;

-		#if _M_SSE >= 0x501
+		#if 0 //_M_SSE >= 0x501

 		dst->_pad = GSVector4::zero();

@ -1342,8 +1342,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
 		{
 			gd.sel.fge = 1;

-			gd.frb = GSVector4i((int)env.FOGCOL.u32[0] & 0x00ff00ff);
-			gd.fga = GSVector4i((int)(env.FOGCOL.u32[0] >> 8) & 0x00ff00ff);
+			gd.frb = env.FOGCOL.u32[0] & 0x00ff00ff;
+			gd.fga = (env.FOGCOL.u32[0] >> 8) & 0x00ff00ff;
 		}

 		if(context->FRAME.PSM != PSM_PSMCT24)
@ -1403,6 +1403,34 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
 		gd.sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000;
 	}

+	#if _M_SSE >= 0x501
+
+	gd.fm = fm;
+	gd.zm = zm;
+
+	if(gd.sel.fpsm == 1)
+	{
+		gd.fm |= 0xff000000;
+	}
+	else if(gd.sel.fpsm == 2)
+	{
+		uint32 rb = gd.fm & 0x00f800f8;
+		uint32 ga = gd.fm & 0x8000f800;
+
+		gd.fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000;
+	}
+
+	if(gd.sel.zpsm == 1)
+	{
+		gd.zm |= 0xff000000;
+	}
+	else if(gd.sel.zpsm == 2)
+	{
+		gd.zm |= 0xffff0000;
+	}
+
+	#else
+
 	gd.fm = GSVector4i(fm);
 	gd.zm = GSVector4i(zm);

@ -1427,6 +1455,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)
 		gd.zm |= GSVector4i::xffff0000();
 	}

+	#endif
+
 	if(gd.sel.prim == GS_SPRITE_CLASS && !gd.sel.ftest && !gd.sel.ztest && data->bbox.eq(data->bbox.rintersect(data->scissor))) // TODO: check scissor horizontally only
 	{
 		gd.sel.notest = 1;
@ -1435,7 +1465,11 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data)

 		for(int i = 0, j = m_vertex.tail; i < j; i++)
 		{
+			#if _M_SSE >= 0x501
+			if((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 7) // aligned to 8
+			#else
 			if((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 3) // aligned to 4
+			#endif
 			{
 				gd.sel.notest = 0;
 			
--- a/plugins/GSdx/GSScanlineEnvironment.h
+++ b/plugins/GSdx/GSScanlineEnvironment.h
@ -116,7 +116,7 @@ __aligned(struct, 32) GSScanlineGlobalData // per batch variables, this is like
 	void* vm;
 	const void* tex[7];
 	uint32* clut;
-	GSVector4i* dimx; 
+	GSVector4i* dimx;

 	const int* fbr;
 	const int* zbr;
@ -125,19 +125,63 @@ __aligned(struct, 32) GSScanlineGlobalData // per batch variables, this is like
 	const GSVector2i* fzbr;
 	const GSVector2i* fzbc;

-	GSVector4i fm, zm;
-	struct {GSVector4i min, max, minmax, mask, invmask;} t; // [u] x 4 [v] x 4
 	GSVector4i aref;
 	GSVector4i afix;
+	struct {GSVector4i min, max, minmax, mask, invmask;} t; // [u] x 4 [v] x 4
+
+	#if _M_SSE >= 0x501
+
+	uint32 fm, zm;
+	uint32 frb, fga;
+	GSVector8 mxl;
+	GSVector8 k; // TEX1.K * 0x10000
+	GSVector8 l; // TEX1.L * -0x10000
+	struct {GSVector8i i, f;} lod; // lcm == 1
+
+	#else
+
+	GSVector4i fm, zm;
 	GSVector4i frb, fga;
 	GSVector4 mxl;
 	GSVector4 k; // TEX1.K * 0x10000
 	GSVector4 l; // TEX1.L * -0x10000
 	struct {GSVector4i i, f;} lod; // lcm == 1
+
+	#endif
 };

 __aligned(struct, 32) GSScanlineLocalData // per prim variables, each thread has its own
 {
+	#if _M_SSE >= 0x501
+
+	struct skip {GSVector8 z, s, t, q; GSVector8i rb, ga, f, _pad;} d[8];
+	struct step {GSVector8 z, stq; GSVector8i c, f;} d8;
+	struct {GSVector8i rb, ga;} c;
+	struct {uint32 z, f;} p;
+
+	// these should be stored on stack as normal local variables (no free regs to use, esp cannot be saved to anywhere, and we need an aligned stack)
+
+	struct 
+	{
+		GSVector8 z, zo;
+		GSVector8i f;
+		GSVector8 s, t, q;
+		GSVector8i rb, ga;
+		GSVector8i zs, zd;
+		GSVector8i uf, vf;
+		GSVector8i cov;
+
+		// mipmapping
+
+		struct {GSVector8i i, f;} lod;
+		GSVector8i uv[2];
+		GSVector8i uv_minmax[2];
+		GSVector8i trb, tga;
+		GSVector8i test;
+	} temp; 
+
+	#else
+
 	struct skip {GSVector4 z, s, t, q; GSVector4i rb, ga, f, _pad;} d[4];
 	struct step {GSVector4 z, stq; GSVector4i c, f;} d4;
 	struct {GSVector4i rb, ga;} c;
@ -164,6 +208,8 @@ __aligned(struct, 32) GSScanlineLocalData // per prim variables, each thread has
 		GSVector4i test;
 	} temp; 

+	#endif
+
 	//

 	const GSScanlineGlobalData* gd;
--- a/plugins/GSdx/GSSetupPrimCodeGenerator.cpp
+++ b/plugins/GSdx/GSSetupPrimCodeGenerator.cpp
@ -22,6 +22,23 @@
 #include "stdafx.h"
 #include "GSSetupPrimCodeGenerator.h"

+#if _M_SSE >= 0x501
+
+const GSVector8 GSSetupPrimCodeGenerator::m_shift[9] =
+{
+	GSVector8(8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f),
+	GSVector8(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f),
+	GSVector8(-1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f),
+	GSVector8(-2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f),
+	GSVector8(-3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f),
+	GSVector8(-4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f),
+	GSVector8(-5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f),
+	GSVector8(-6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f),
+	GSVector8(-7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f),
+};
+
+#else
+
 const GSVector4 GSSetupPrimCodeGenerator::m_shift[5] =
 {
 	GSVector4(4.0f, 4.0f, 4.0f, 4.0f),
@ -31,6 +48,8 @@ const GSVector4 GSSetupPrimCodeGenerator::m_shift[5] =
 	GSVector4(-3.0f, -2.0f, -1.0f, 0.0f),
 };

+#endif
+
 GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
 	: GSCodeGenerator(code, maxsize)
 	, m_local(*(GSScanlineLocalData*)param)
--- a/plugins/GSdx/GSSetupPrimCodeGenerator.h
+++ b/plugins/GSdx/GSSetupPrimCodeGenerator.h
@ -42,5 +42,9 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator
 public:
 	GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);

+	#if _M_SSE >= 0x501
+	static const GSVector8 m_shift[9];
+	#else
 	static const GSVector4 m_shift[5];
+	#endif
 };
--- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp
+++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp
@ -23,7 +23,7 @@
 #include "GSSetupPrimCodeGenerator.h"
 #include "GSVertexSW.h"

-#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
+#if _M_SSE == 0x500 && !(defined(_M_AMD64) || defined(_WIN64))

 using namespace Xbyak;

--- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx2.cpp
+++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx2.cpp
@ -0,0 +1,353 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#include "stdafx.h"
+#include "GSSetupPrimCodeGenerator.h"
+#include "GSVertexSW.h"
+
+#if _M_SSE >= 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
+
+using namespace Xbyak;
+
+static const int _args = 0;
+static const int _vertex = _args + 4;
+static const int _index = _args + 8;
+static const int _dscan = _args + 12;
+
+void GSSetupPrimCodeGenerator::Generate()
+{
+	if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
+	{
+		mov(edx, dword[esp + _dscan]);
+
+		for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
+		{
+			vmovaps(Ymm(3 + i), ptr[&m_shift[i]]);
+		}
+	}
+
+	Depth();
+
+	Texture();
+
+	Color();
+
+	ret();
+}
+
+void GSSetupPrimCodeGenerator::Depth()
+{
+	if(!m_en.z && !m_en.f)
+	{
+		return;
+	}
+
+	if(m_sel.prim != GS_SPRITE_CLASS)
+	{
+		// GSVector4 p = dscan.p;
+
+		if(m_en.f)
+		{
+			// GSVector8 df = GSVector8::broadcast32(dscan.p.wwww());
+
+			vbroadcastss(ymm1, ptr[edx + offsetof(GSVertexSW, p.w)]);
+		}
+
+		if(m_en.z)
+		{
+			// GSVector8 dz = GSVector8::broadcast32(dscan.p.zzzz());
+
+			vbroadcastss(ymm2, ptr[edx + offsetof(GSVertexSW, p.z)]);
+		}
+
+		if(m_en.f)
+		{
+			// m_local.d8.f = GSVector8i(df * shift[0]).xxzzlh();
+
+			vmulps(ymm0, ymm1, ymm3);
+			vcvttps2dq(ymm0, ymm0);
+			vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
+			vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
+			vmovdqa(ptr[&m_local.d8.f], ymm0);
+		}
+
+		if(m_en.z)
+		{
+			// m_local.d8.z = dz * shift[0];
+
+			vmulps(ymm0, ymm2, ymm3);
+			vmovaps(ptr[&m_local.d8.z], ymm0);
+		}
+
+		for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
+		{
+			if(m_en.f)
+			{
+				// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
+
+				if(i < 4) vmulps(ymm0, ymm1, Ymm(4 + i));
+				else vmulps(ymm0, ymm1, ptr[&m_shift[i + 1]]);
+				vcvttps2dq(ymm0, ymm0);
+				vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
+				vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
+				vmovdqa(ptr[&m_local.d[i].f], ymm0);
+			}
+
+			if(m_en.z)
+			{
+				// m_local.d[i].z = dz * shift[1 + i];
+
+				if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
+				else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
+				vmovaps(ptr[&m_local.d[i].z], ymm0);
+			}
+		}
+	}
+	else
+	{
+		// GSVector4 p = vertex[index[1]].p;
+
+		mov(ecx, ptr[esp + _index]);
+		mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
+		shl(ecx, 6); // * sizeof(GSVertexSW)
+		add(ecx, ptr[esp + _vertex]);
+
+		if(m_en.f)
+		{
+			// m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();
+
+			vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
+			vcvttps2dq(xmm0, xmm0);
+			vpextrd(ptr[&m_local.p.f], xmm0, 3);
+		}
+
+		if(m_en.z)
+		{
+			// m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
+
+			mov(eax, ptr[ecx + offsetof(GSVertexSW, t.w)]);
+			mov(ptr[&m_local.p.z], eax);
+		}
+	}
+}
+
+void GSSetupPrimCodeGenerator::Texture()
+{
+	if(!m_en.t)
+	{
+		return;
+	}
+
+	// GSVector8 dt(dscan.t);
+
+	vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, t)]);
+
+	// GSVector8 dt8 = dt * shift[0];
+
+	vmulps(ymm1, ymm0, ymm3);
+
+	if(m_sel.fst)
+	{
+		// m_local.d8.stq = GSVector8::cast(GSVector8i(dt8));
+
+		vcvttps2dq(ymm1, ymm1);
+
+		vmovdqa(ptr[&m_local.d8.stq], ymm1);
+	}
+	else
+	{
+		// m_local.d8.stq = dt8;
+
+		vmovaps(ptr[&m_local.d8.stq], ymm1);
+	}
+
+	for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
+	{
+		// GSVector8 dstq = dt.xxxx/yyyy/zzzz();
+
+		vshufps(ymm1, ymm0, ymm0, (uint8)_MM_SHUFFLE(j, j, j, j));
+
+		for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
+		{
+			// GSVector8 v = dstq * shift[1 + i];
+
+			if(i < 4) vmulps(ymm2, ymm1, Ymm(4 + i));
+			else vmulps(ymm2, ymm1, ptr[&m_shift[i + 1]]);
+
+			if(m_sel.fst)
+			{
+				// m_local.d[i].s/t = GSVector8::cast(GSVector8i(v));
+
+				vcvttps2dq(ymm2, ymm2);
+
+				switch(j)
+				{
+				case 0: vmovdqa(ptr[&m_local.d[i].s], ymm2); break;
+				case 1: vmovdqa(ptr[&m_local.d[i].t], ymm2); break;
+				}
+			}
+			else
+			{
+				// m_local.d[i].s/t/q = v;
+
+				switch(j)
+				{
+				case 0: vmovaps(ptr[&m_local.d[i].s], ymm2); break;
+				case 1: vmovaps(ptr[&m_local.d[i].t], ymm2); break;
+				case 2: vmovaps(ptr[&m_local.d[i].q], ymm2); break;
+				}
+			}
+		}
+	}
+}
+
+void GSSetupPrimCodeGenerator::Color()
+{
+	if(!m_en.c)
+	{
+		return;
+	}
+
+	if(m_sel.iip)
+	{
+		// GSVector8 dc(dscan.c);
+
+		vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]);
+
+		// m_local.d8.c = GSVector8i(dc * shift[0]).xzyw().ps32();
+
+		vmulps(ymm1, ymm0, ymm3);
+		vcvttps2dq(ymm1, ymm1);
+		vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0));
+		vpackssdw(ymm1, ymm1);
+		vmovdqa(ptr[&m_local.d8.c], ymm1);
+
+		// ymm3 is not needed anymore
+
+		// GSVector8 dr = dc.xxxx();
+		// GSVector8 db = dc.zzzz();
+
+		vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
+		vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+		for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
+		{
+			// GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32();
+
+			if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
+			else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
+			vcvttps2dq(ymm0, ymm0);
+			vpackssdw(ymm0, ymm0);
+
+			// GSVector4i b = GSVector8i(db * shift[1 + i]).ps32();
+
+			if(i < 4) vmulps(ymm1, ymm3, Ymm(4 + i));
+			else vmulps(ymm1, ymm3, ptr[&m_shift[i + 1]]);
+			vcvttps2dq(ymm1, ymm1);
+			vpackssdw(ymm1, ymm1);
+
+			// m_local.d[i].rb = r.upl16(b);
+
+			vpunpcklwd(ymm0, ymm1);
+			vmovdqa(ptr[&m_local.d[i].rb], ymm0);
+		}
+
+		// GSVector8 dc(dscan.c);
+
+		vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
+
+		// GSVector8 dg = dc.yyyy();
+		// GSVector8 da = dc.wwww();
+
+		vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(1, 1, 1, 1));
+		vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
+
+		for(int i = 0; i < (m_sel.notest ? 1 : 8); i++)
+		{
+			// GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32();
+
+			if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
+			else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
+			vcvttps2dq(ymm0, ymm0);
+			vpackssdw(ymm0, ymm0);
+
+			// GSVector8i a = GSVector8i(da * shift[1 + i]).ps32();
+
+			if(i < 4) vmulps(ymm1, ymm3, Ymm(4 + i));
+			else vmulps(ymm1, ymm3, ptr[&m_shift[i + 1]]);
+			vcvttps2dq(ymm1, ymm1);
+			vpackssdw(ymm1, ymm1);
+
+			// m_local.d[i].ga = g.upl16(a);
+
+			vpunpcklwd(ymm0, ymm1);
+			vmovdqa(ptr[&m_local.d[i].ga], ymm0);
+		}
+	}
+	else
+	{
+		// GSVector8i c = GSVector8i(GSVector8(vertex[index[last]].c));
+
+		int last = 0;
+
+		switch(m_sel.prim)
+		{
+		case GS_POINT_CLASS: last = 0; break;
+		case GS_LINE_CLASS: last = 1; break;
+		case GS_TRIANGLE_CLASS: last = 2; break;
+		case GS_SPRITE_CLASS: last = 1; break;
+		}
+
+		if(!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
+		{
+			mov(ecx, ptr[esp + _index]);
+			mov(ecx, ptr[ecx + sizeof(uint32) * last]);
+			shl(ecx, 6); // * sizeof(GSVertexSW)
+			add(ecx, ptr[esp + _vertex]);
+		}
+
+		vbroadcasti128(ymm0, ptr[ecx + offsetof(GSVertexSW, c)]);
+		vcvttps2dq(ymm0, ymm0);
+
+		// c = c.upl16(c.zwxy());
+
+		vpshufd(ymm1, ymm0, _MM_SHUFFLE(1, 0, 3, 2));
+		vpunpcklwd(ymm0, ymm1);
+
+		// if(!tme) c = c.srl16(7);
+
+		if(m_sel.tfx == TFX_NONE)
+		{
+			vpsrlw(ymm0, 7);
+		}
+
+		// m_local.c.rb = c.xxxx();
+		// m_local.c.ga = c.zzzz();
+
+		vpshufd(ymm1, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
+		vpshufd(ymm2, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
+
+		vmovdqa(ptr[&m_local.c.rb], ymm1);
+		vmovdqa(ptr[&m_local.c.ga], ymm2);
+	}
+}
+
+#endif
--- a/plugins/GSdx/GSVector.cpp
+++ b/plugins/GSdx/GSVector.cpp
@ -75,6 +75,7 @@ const GSVector4 GSVector4::m_x4f800000(_mm_castsi128_ps(_mm_set1_epi32(0x4f80000

 #if _M_SSE >= 0x500

+const GSVector8 GSVector8::m_half(0.5f);
 const GSVector8 GSVector8::m_one(1.0f);
 const GSVector8 GSVector8::m_x7fffffff(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
 const GSVector8 GSVector8::m_x80000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
--- a/plugins/GSdx/GSVector.h
+++ b/plugins/GSdx/GSVector.h
@ -4119,6 +4119,15 @@ public:

 	// TODO: extract/insert

+	template<int i> __forceinline int extract32() const
+	{
+		GSVector4i v = extract<i / 4>();
+
+		if((i & 3) == 0) return GSVector4i::store(v);
+
+		return v.extract32<i>();
+	}
+
 	template<int i> __forceinline GSVector4i extract() const
 	{
 		if(i == 0) return GSVector4i(_mm256_castsi256_si128(m));
@ -4141,7 +4150,6 @@ public:
 		GSVector4i a0 = extract<0>();
 		GSVector4i a1 = extract<1>();

-
 		v0 = GSVector4i::load((int)ptr[a0.extract32<0>()]);
 		v0 = v0.insert32<1>((int)ptr[a0.extract32<1>()]);
 		v0 = v0.insert32<2>((int)ptr[a0.extract32<2>()]);
@ -4191,14 +4199,14 @@ public:
 		return cast(v0).insert<1>(v1);
 	}

-	template<> __forceinline GSVector8i gather32_32<uint32, uint8>(const uint32* ptr1, const uint8* ptr2) const
+	template<> __forceinline GSVector8i gather32_32<uint8, uint32>(const uint8* ptr1, const uint32* ptr2) const
 	{
-		return gather32_32<uint8>(ptr2).gather32_32<uint32>(ptr1);
+		return gather32_32<uint8>(ptr1).gather32_32<uint32>(ptr2);
 	}

 	template<> __forceinline GSVector8i gather32_32<uint32, uint32>(const uint32* ptr1, const uint32* ptr2) const
 	{
-		return gather32_32<uint32>(ptr2).gather32_32<uint32>(ptr1);
+		return gather32_32<uint32>(ptr1).gather32_32<uint32>(ptr2);
 	}

 	template<class T> __forceinline void gather32_32(const T* RESTRICT ptr, GSVector8i* RESTRICT dst) const
@ -4731,6 +4739,16 @@ public:
 		return GSVector8i(_mm256_broadcastq_epi64(v.m));
 	}

+	__forceinline static GSVector8i broadcast128(const GSVector4i& v)
+	{
+		// this one only has m128 source op, it will be saved to a temp on stack if the compiler is not smart enough and use the address of v directly (<= vs2012u3rc2)
+
+		return GSVector8i(_mm256_broadcastsi128_si256(v)); // fastest
+		//return GSVector8i(v); // almost as fast as broadcast
+		//return cast(v).insert<1>(v); // slow
+		//return cast(v).aa(); // slowest
+	}
+
 	__forceinline static GSVector8i zero() {return GSVector8i(_mm256_setzero_si256());}

 	__forceinline static GSVector8i xffffffff() {return zero() == zero();}
@ -4958,6 +4976,7 @@ public:
 		__m128 m0, m1;
 	};

+	static const GSVector8 m_half;
 	static const GSVector8 m_one;
 	static const GSVector8 m_x7fffffff;
 	static const GSVector8 m_x80000000;
--- a/plugins/GSdx/GSdx_vs11.vcxproj
+++ b/plugins/GSdx/GSdx_vs11.vcxproj
@ -646,6 +646,18 @@
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|Win32'">true</ExcludedFromBuild>
+    </ClCompile>
+    <ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx2.cpp">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
    </ClCompile>
    <ClCompile Include="GSDrawScanlineCodeGenerator.x86.cpp">
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
@ -737,6 +749,7 @@
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX2|x64'">true</ExcludedFromBuild>
    </ClCompile>
+    <ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp" />
    <ClCompile Include="GSSetupPrimCodeGenerator.x86.cpp">
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX2|Win32'">true</ExcludedFromBuild>
@ -2054,4 +2067,4 @@
      <UserProperties RESOURCE_FILE="GSdx.rc" />
    </VisualStudio>
  </ProjectExtensions>
-</Project>
+</Project>
--- a/plugins/GSdx/GSdx_vs11.vcxproj.filters
+++ b/plugins/GSdx/GSdx_vs11.vcxproj.filters
@ -336,6 +336,12 @@
    <ClCompile Include="GSRendererCS.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
+    <ClCompile Include="GSDrawScanlineCodeGenerator.x86.avx2.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="GSSetupPrimCodeGenerator.x86.avx2.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="GLLoader.h">
@ -728,4 +734,4 @@
      <Filter>Resource Files</Filter>
    </ResourceCompile>
  </ItemGroup>
-</Project>
+</Project>
--- a/plugins/GSdx/xbyak/xbyak_mnemonic.h
+++ b/plugins/GSdx/xbyak/xbyak_mnemonic.h
@ -789,22 +789,22 @@ void vpsignw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(x
 void vpsignw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x09, true, -1); }
 void vpsignd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0A, true, -1); }
 void vpsignd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x0A, true, -1); }
-void vpsllw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF1, false, -1); }
-void vpsllw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF1, false, -1); }
-void vpslld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF2, false, -1); }
-void vpslld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF2, false, -1); }
-void vpsllq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF3, false, -1); }
-void vpsllq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF3, false, -1); }
-void vpsraw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE1, false, -1); }
-void vpsraw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE1, false, -1); }
-void vpsrad(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE2, false, -1); }
-void vpsrad(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE2, false, -1); }
-void vpsrlw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD1, false, -1); }
-void vpsrlw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD1, false, -1); }
-void vpsrld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD2, false, -1); }
-void vpsrld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD2, false, -1); }
-void vpsrlq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD3, false, -1); }
-void vpsrlq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD3, false, -1); }
+void vpsllw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF1, true, -1); }
+void vpsllw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF1, true, -1); }
+void vpslld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF2, true, -1); }
+void vpslld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF2, true, -1); }
+void vpsllq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF3, true, -1); }
+void vpsllq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF3, true, -1); }
+void vpsraw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE1, true, -1); }
+void vpsraw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE1, true, -1); }
+void vpsrad(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE2, true, -1); }
+void vpsrad(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xE2, true, -1); }
+void vpsrlw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD1, true, -1); }
+void vpsrlw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD1, true, -1); }
+void vpsrld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD2, true, -1); }
+void vpsrld(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD2, true, -1); }
+void vpsrlq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD3, true, -1); }
+void vpsrlq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xD3, true, -1); }
 void vpsubb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF8, true, -1); }
 void vpsubb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xF8, true, -1); }
 void vpsubw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF9, true, -1); }
@ -1345,8 +1345,8 @@ void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) {
 void vblendvpd(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4B, true); db(x4.getIdx() << 4); }
 void vblendvps(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4A, true); db(x4.getIdx() << 4); }
 void vblendvps(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4A, true); db(x4.getIdx() << 4); }
-void vpblendvb(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4C, false); db(x4.getIdx() << 4); }
-void vpblendvb(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4C, false); db(x4.getIdx() << 4); }
+void vpblendvb(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4C, true); db(x4.getIdx() << 4); }
+void vpblendvb(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4C, true); db(x4.getIdx() << 4); }
 void vmovd(const Xmm& x, const Reg32& reg) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x6E, false, 0); }
 void vmovd(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x6E, false, 0); }
 void vmovd(const Reg32& reg, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x7E, false, 0); }
@ -1410,3 +1410,13 @@ void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1
 void vpgatherqd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x91, 0, 2); }
 void vpgatherdq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x90, 1, 0); }
 void vpgatherqq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x91, 1, 1); }
+
+// mods
+
+void vpbroadcastb(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x78, true, 0); }
+void vpbroadcastw(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x79, true, 0); }
+void vpbroadcastd(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x58, true, 0); }
+void vpbroadcastq(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0x59, true, 0); }
+
+// supportYMM = true
+// vpblendvb, vpsllw-vpsrlq