Compare commits

...

9 Commits

Author SHA1 Message Date
TellowKrinkle
fd145e65aa GS: Remove virtual destructor from GSAlignedClass
No point, and made it not a standard layout type
2021-11-04 19:32:27 +00:00
TellowKrinkle
6596b7f27e GS: Enable AVX2 on x64 2021-11-04 19:32:27 +00:00
TellowKrinkle
9d767838d6 GS: Remove old DrawScanline code generators 2021-11-04 19:32:27 +00:00
TellowKrinkle
f55219bb1b GS: Replace 6 DrawScanline code generators with one merged one 2021-11-04 19:32:27 +00:00
TellowKrinkle
805b647c73 GS: Remove old SetupPrim code generators 2021-11-04 19:32:27 +00:00
TellowKrinkle
fd0351ca8f GS: Replace 6 SetupPrim code generators with one merged one 2021-11-04 19:32:27 +00:00
TellowKrinkle
ed5a7802f3 Common: Add non-constant offsetof macro 2021-11-04 19:32:27 +00:00
TellowKrinkle
44f8317b7e GS: Add new code generator for easy native-isa codegen 2021-11-04 19:32:27 +00:00
TellowKrinkle
0200933ddd GS: Don't catch code generation exceptions
If codegen throws an exception, it ends up just crashing when you jump to the incompletely-generated code which is kind of useless
2021-11-04 19:32:27 +00:00
28 changed files with 4873 additions and 16873 deletions

View File

@@ -31,6 +31,10 @@
#include "common/emitter/x86_intrin.h"
// The C++ standard doesn't allow `offsetof` to be used on non-constant values (e.g. `offsetof(class, field[i])`)
// Use this in those situations
// `a` is the enclosing type, `b` is a member expression of that type (it may contain a runtime array index).
// The result is the address of `b` taken through a null `a*`, converted to size_t; nothing is read through the pointer.
#define OFFSETOF(a, b) (reinterpret_cast<size_t>(&(static_cast<a*>(nullptr)->b)))
// Renamed ARRAYSIZE to ArraySize -- looks nice and gets rid of Windows.h conflicts (air)
// Notes: I'd have used ARRAY_SIZE instead but ran into cross-platform lib conflicts with
// that as well. >_<

View File

@@ -639,21 +639,12 @@ set(pcsx2GSSources
GS/Renderers/HW/GSTextureCache.cpp
GS/Renderers/SW/GSDrawScanline.cpp
GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.cpp
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx.cpp
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx2.cpp
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx.cpp
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx2.cpp
GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
GS/Renderers/SW/GSNewCodeGenerator.cpp
GS/Renderers/SW/GSRasterizer.cpp
GS/Renderers/SW/GSRendererSW.cpp
GS/Renderers/SW/GSSetupPrimCodeGenerator.cpp
GS/Renderers/SW/GSSetupPrimCodeGenerator.x64.cpp
GS/Renderers/SW/GSSetupPrimCodeGenerator.x64.avx.cpp
GS/Renderers/SW/GSSetupPrimCodeGenerator.x64.avx2.cpp
GS/Renderers/SW/GSSetupPrimCodeGenerator.x86.cpp
GS/Renderers/SW/GSSetupPrimCodeGenerator.x86.avx.cpp
GS/Renderers/SW/GSSetupPrimCodeGenerator.x86.avx2.cpp
GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
GS/Renderers/SW/GSTextureCacheSW.cpp
GS/Renderers/SW/GSTextureSW.cpp
GS/Renderers/OpenGL/GLLoader.cpp
@@ -679,7 +670,6 @@ set(pcsx2GSHeaders
GS/GSDrawingEnvironment.h
GS/GSDump.h
GS/GS_types.h
GS/GS_codegen.h
GS/GS.h
GS/GSLocalMemory.h
GS/GSLzma.h
@@ -712,11 +702,14 @@ set(pcsx2GSHeaders
GS/Renderers/HW/GSTextureCache.h
GS/Renderers/HW/GSVertexHW.h
GS/Renderers/SW/GSDrawScanlineCodeGenerator.h
GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h
GS/Renderers/SW/GSDrawScanline.h
GS/Renderers/SW/GSNewCodeGenerator.h
GS/Renderers/SW/GSRasterizer.h
GS/Renderers/SW/GSRendererSW.h
GS/Renderers/SW/GSScanlineEnvironment.h
GS/Renderers/SW/GSSetupPrimCodeGenerator.h
GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h
GS/Renderers/SW/GSTextureCacheSW.h
GS/Renderers/SW/GSTextureSW.h
GS/Renderers/SW/GSVertexSW.h

View File

@@ -18,10 +18,11 @@
template <int i>
class GSAlignedClass
{
public:
GSAlignedClass() {}
virtual ~GSAlignedClass() {}
protected:
GSAlignedClass() = default;
~GSAlignedClass() = default;
public:
void* operator new(size_t size)
{
return _aligned_malloc(size, i);

View File

@@ -110,11 +110,7 @@ extern void vmfree(void* ptr, size_t size);
// Convert gcc see define into GS (windows) define
#if defined(__AVX2__)
#if defined(__x86_64__)
#define _M_SSE 0x500 // TODO
#else
#define _M_SSE 0x501
#endif
#define _M_SSE 0x501
#elif defined(__AVX__)
#define _M_SSE 0x500
#elif defined(__SSE4_1__)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,189 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "GSScanlineEnvironment.h"
#include "GSNewCodeGenerator.h"
#undef _t // Conflict with wx, hopefully no one needs this
// Compile-time selection of the vector register type used by the draw-scanline generator.
// When _M_SSE is at least 0x501 the generator works on Xbyak::Ymm registers, otherwise on Xbyak::Xmm.
// Exactly one of DRAW_SCANLINE_USING_XMM / DRAW_SCANLINE_USING_YMM is defined to 1, the other to 0.
#if _M_SSE >= 0x501
#define DRAW_SCANLINE_VECTOR_REGISTER Xbyak::Ymm
#define DRAW_SCANLINE_USING_XMM 0
#define DRAW_SCANLINE_USING_YMM 1
#else
#define DRAW_SCANLINE_VECTOR_REGISTER Xbyak::Xmm
#define DRAW_SCANLINE_USING_XMM 1
#define DRAW_SCANLINE_USING_YMM 0
#endif
/// Draw-scanline code generator built on top of GSNewCodeGenerator.
/// The vector register type it operates on (XYm) is fixed at compile time by DRAW_SCANLINE_VECTOR_REGISTER,
/// so the same member declarations serve both the Xmm and the Ymm build.
class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator
{
using _parent = GSNewCodeGenerator;
using XYm = DRAW_SCANLINE_VECTOR_REGISTER;
/// On x86-64 we reserve a bunch of GPRs for holding addresses of locals that would otherwise be hard to reach
/// On x86-32 the same values are just raw 32-bit addresses
using LocalAddr = Choose3264<size_t, AddressReg>::type;
// Compile-time properties derived from XYm and the target width
constexpr static bool isXmm = std::is_same<XYm, Xbyak::Xmm>::value;
constexpr static bool isYmm = std::is_same<XYm, Xbyak::Ymm>::value;
// 8 on 64-bit targets, 4 on 32-bit targets
constexpr static int wordsize = is64 ? 8 : 4;
// 16 when XYm is Xmm, 32 when it is Ymm; vecsizelog is the matching base-2 logarithm
constexpr static int vecsize = isXmm ? 16 : 32;
constexpr static int vecsizelog = isXmm ? 4 : 5;
// vecsize divided into 4-byte units
constexpr static int vecints = vecsize / 4;
// MARK: - Constants
constexpr static int _32_args = 16;
// Placeholder for offsets that have no meaning on the current target (see _v below)
constexpr static int _invalid = 0xaaaaaaaa;
#ifdef _WIN32
constexpr static int _64_top = 8 * 0;
// XMM registers will be saved to `rsp + _64_win_xmm_start + id - 6`
// Which will put xmm6 after the temporaries, then xmm7, etc
constexpr static int _64_win_xmm_start = 8 * 2;
// Windows has no redzone and also has 10 xmm registers to save
constexpr static int _64_win_stack_size = _64_win_xmm_start + 16 * 10;
#else
// System-V has a redzone so stick everything there
constexpr static int _64_rz_rbx = -8 * 1;
constexpr static int _64_rz_r12 = -8 * 2;
constexpr static int _64_rz_r13 = -8 * 3;
constexpr static int _64_rz_r14 = -8 * 4;
constexpr static int _64_rz_r15 = -8 * 5;
constexpr static int _64_top = -8 * 6;
#endif
// 64-bit builds take the _64_* value; 32-bit builds use an offset relative to _32_args.
// _v is only meaningful on 32-bit builds and is _invalid on 64-bit builds.
constexpr static int _top = is64 ? _64_top : _32_args + 4;
constexpr static int _v = is64 ? _invalid : _32_args + 8;
GSScanlineSelector m_sel;
GSScanlineLocalData& m_local;
bool m_rip;
bool use_lod;
// Vector registers 0-15 typed as XYm (Xmm or Ymm depending on the build)
const XYm xym0{0}, xym1{1}, xym2{2}, xym3{3}, xym4{4}, xym5{5}, xym6{6}, xym7{7}, xym8{8}, xym9{9}, xym10{10}, xym11{11}, xym12{12}, xym13{13}, xym14{14}, xym15{15};
/// Note: a2 and t3 are only available on x86-64
/// Outside of Init, usable registers are a0, t0, t1, t2, t3[x64], rax, rbx, rdx, r10+
const AddressReg a0, a1, a2, a3, t0, t1, t2, t3;
const LocalAddr _g_const, _m_local, _m_local__gd, _m_local__gd__vm;
/// Available on both x86 and x64, not always valid
const XYm _rb, _ga, _fm, _zm, _fd, _test;
/// Always valid if needed, x64 only
const XYm _z, _f, _s, _t, _q, _f_rb, _f_ga;
/// Returns the first arg on 32-bit, second on 64-bit
static LocalAddr chooseLocal(const void* addr32, AddressReg reg64)
{
return choose3264((size_t)addr32, reg64);
}
public:
GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator* base, CPUInfo cpu, void* param, uint64 key);
void Generate();
private:
/// Loads the given address into the given register if needed, and returns something that can be used in a `ptr[]`
LocalAddr loadAddress(AddressReg reg, const void* addr);
/// Broadcast 128 bits of floats from memory to the whole register, whatever size that register might be
void broadcastf128(const XYm& reg, const Xbyak::Address& mem);
/// Broadcast 128 bits of integers from memory to the whole register, whatever size that register might be
void broadcasti128(const XYm& reg, const Xbyak::Address& mem);
/// Broadcast a floating-point variable stored in GSScanlineLocalData to the whole register
/// On YMM registers this will be a broadcast from a 32-bit value
/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
void broadcastssLocal(const XYm& reg, const Xbyak::Address& mem);
/// Broadcast a qword variable stored in GSScanlineLocalData to the whole register
/// On YMM registers this will be a broadcast from a 64-bit value
/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
void pbroadcastqLocal(const XYm& reg, const Xbyak::Address& mem);
/// Broadcast a dword variable stored in GSScanlineLocalData to the whole register
/// On YMM registers this will be a broadcast from a 32-bit value
/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
void pbroadcastdLocal(const XYm& reg, const Xbyak::Address& mem);
/// Broadcast a word variable stored in GSScanlineLocalData to the whole register
/// On YMM registers this will be a broadcast from a 16-bit value
/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
void pbroadcastwLocal(const XYm& reg, const Xbyak::Address& mem);
/// Broadcast a 32-bit GPR to a vector register
void broadcastGPRToVec(const XYm& vec, const Xbyak::Reg32& gpr);
// Small arithmetic / blending helpers operating on XYm registers
void modulate16(const XYm& a, const Xbyak::Operand& f, uint8 shift);
void lerp16(const XYm& a, const XYm& b, const XYm& f, uint8 shift);
void lerp16_4(const XYm& a, const XYm& b, const XYm& f);
void mix16(const XYm& a, const XYm& b, const XYm& temp);
void clamp16(const XYm& a, const XYm& temp);
void alltrue(const XYm& test);
void blend(const XYm& a, const XYm& b, const XYm& mask);
void blendr(const XYm& b, const XYm& a, const XYm& mask);
void blend8(const XYm& a, const XYm& b);
void blend8r(const XYm& b, const XYm& a);
void split16_2x8(const XYm& l, const XYm& h, const XYm& src);
// Stages of the generated routine
void Init();
void Step();
void TestZ(const XYm& temp1, const XYm& temp2);
void SampleTexture();
void SampleTexture_TexelReadHelper(int mip_offset);
void Wrap(const XYm& uv);
void Wrap(const XYm& uv0, const XYm& uv1);
void SampleTextureLOD();
void WrapLOD(const XYm& uv);
void WrapLOD(const XYm& uv0, const XYm& uv1);
void AlphaTFX();
void ReadMask();
void TestAlpha();
void ColorTFX();
void Fog();
void ReadFrame();
void TestDestAlpha();
void WriteMask();
void WriteZBuf();
void AlphaBlend();
void WriteFrame();
void ReadPixel(const XYm& dst, const XYm& tmp, const AddressReg& addr);
// The mask register is 8 bits wide in the XMM build and 32 bits wide in the YMM build
#if DRAW_SCANLINE_USING_XMM
void WritePixel(const XYm& src_, const AddressReg& addr, const Xbyak::Reg8& mask, bool fast, int psm, int fz);
#else
void WritePixel(const XYm& src_, const AddressReg& addr, const Xbyak::Reg32& mask, bool fast, int psm, int fz);
#endif
void WritePixel(const Xmm& src, const AddressReg& addr, uint8 i, uint8 j, int psm);
void ReadTexel1(const XYm& dst, const XYm& src, const XYm& tmp1, const XYm& tmp2, int mip_offset);
void ReadTexel4(
const XYm& d0, const XYm& d1,
const XYm& d2s0, const XYm& d3s1,
const XYm& s2, const XYm& s3,
const XYm& tmp1, const XYm& tmp2,
int mip_offset);
void ReadTexelImpl(
const XYm& d0, const XYm& d1,
const XYm& d2s0, const XYm& d3s1,
const XYm& s2, const XYm& s3,
const XYm& tmp1, const XYm& tmp2,
int pixels, int mip_offset);
void ReadTexelImplLoadTexLOD(int lod, int mip_offset);
// Register-width specific variants: these take Ymm / Xmm explicitly rather than XYm
void ReadTexelImplYmm(
const Ymm& d0, const Ymm& d1,
const Ymm& d2s0, const Ymm& d3s1,
const Ymm& s2, const Ymm& s3,
const Ymm& tmp,
int pixels, int mip_offset);
void ReadTexelImplSSE4(
const Xmm& d0, const Xmm& d1,
const Xmm& d2s0, const Xmm& d3s1,
const Xmm& s2, const Xmm& s3,
int pixels, int mip_offset);
void ReadTexelImpl(const Xmm& dst, const Xmm& addr, uint8 i, bool texInA3, bool preserveDst);
};

View File

@@ -15,17 +15,8 @@
#include "PrecompiledHeader.h"
#include "GSDrawScanlineCodeGenerator.h"
#include "GSDrawScanlineCodeGenerator.all.h"
#if _M_SSE >= 0x501
#else
// Picks the AVX or the SSE generator depending on what the detected CPU reports.
void GSDrawScanlineCodeGenerator::Generate()
{
	const bool use_avx = m_cpu.has(Xbyak::util::Cpu::tAVX);

	if (!use_avx)
	{
		Generate_SSE();
		return;
	}

	Generate_AVX();
}
#endif
GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
@@ -37,227 +28,5 @@ GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key
if (m_sel.breakpoint)
db(0xCC);
try
{
Generate();
}
catch (std::exception& e)
{
fprintf(stderr, "ERR:GSDrawScanlineCodeGenerator %s\n", e.what());
}
}
// Emits a 16-bit multiply of `a` by `f`, leaving the result in `a`.
// With shift == 0 the (v)pmulhrsw form is used (on SSE only when SSSE3 is present);
// otherwise `a` is first shifted left by shift + 1 and (v)pmulhw is used.
void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, uint8 shift)
{
	const bool use_avx = m_cpu.has(Xbyak::util::Cpu::tAVX);

	if (use_avx)
	{
		if (shift != 0)
		{
			vpsllw(a, shift + 1);
			vpmulhw(a, f);
			return;
		}

		vpmulhrsw(a, f);
		return;
	}

	// SSE path: pmulhrsw needs SSSE3, so fall back to shift + pmulhw without it
	if (shift == 0 && m_cpu.has(Xbyak::util::Cpu::tSSSE3))
	{
		pmulhrsw(a, f);
		return;
	}

	psllw(a, shift + 1);
	pmulhw(a, f);
}
// Emits a = b + modulate16(a - b, f, shift) using 16-bit lanes.
void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& f, uint8 shift)
{
	const bool use_avx = m_cpu.has(Xbyak::util::Cpu::tAVX);

	// a -= b
	if (use_avx)
		vpsubw(a, b);
	else
		psubw(a, b);

	modulate16(a, f, shift);

	// a += b
	if (use_avx)
		vpaddw(a, b);
	else
		paddw(a, b);
}
// Emits a = b + (((a - b) * f) >> 4) on 16-bit lanes (low multiply, arithmetic shift).
void GSDrawScanlineCodeGenerator::lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f)
{
	if (!m_cpu.has(Xbyak::util::Cpu::tAVX))
	{
		psubw(a, b);
		pmullw(a, f);
		psraw(a, 4);
		paddw(a, b);
		return;
	}

	vpsubw(a, b);
	vpmullw(a, f);
	vpsraw(a, 4);
	vpaddw(a, b);
}
// Emits a word blend of `b` into `a` with the immediate 0xaa. `temp` is not used.
void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp)
{
	if (!m_cpu.has(Xbyak::util::Cpu::tAVX))
	{
		pblendw(a, b, 0xaa);
		return;
	}

	vpblendw(a, b, 0xaa);
}
// Emits a pack of the 16-bit lanes of `a` to unsigned bytes followed by a zero-extension back to 16 bits.
// `temp` is not used.
void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp)
{
	if (!m_cpu.has(Xbyak::util::Cpu::tAVX))
	{
		packuswb(a, a);
		pmovzxbw(a, a);
		return;
	}

	vpackuswb(a, a);
#if _M_SSE >= 0x501
	// Greg: why ?
	if (m_cpu.has(Xbyak::util::Cpu::tAVX2))
	{
		ASSERT(a.isYMM());
		const Ymm whole(a.getIdx());
		vpermq(whole, whole, _MM_SHUFFLE(3, 1, 2, 0)); // this sucks
	}
#endif
	vpmovzxbw(a, a);
}
void GSDrawScanlineCodeGenerator::alltrue(const Xmm& test)
{
uint32 mask = test.isYMM() ? 0xffffffff : 0xffff;
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
{
vpmovmskb(eax, test);
cmp(eax, mask);
je("step", T_NEAR);
}
else
{
pmovmskb(eax, test);
cmp(eax, mask);
je("step", T_NEAR);
}
}
// Emits a = (b & mask) | (a & ~mask). Both `b` and `mask` are overwritten in the process.
void GSDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask)
{
	const bool use_avx = m_cpu.has(Xbyak::util::Cpu::tAVX);

	if (!use_avx)
	{
		pand(b, mask);
		pandn(mask, a);
		por(b, mask);
		movdqa(a, b);
		return;
	}

	vpand(b, mask);
	vpandn(mask, a);
	vpor(a, b, mask);
}
// Emits b = (b & mask) | (a & ~mask). `mask` is overwritten in the process.
void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask)
{
	const bool use_avx = m_cpu.has(Xbyak::util::Cpu::tAVX);

	if (!use_avx)
	{
		pand(b, mask);
		pandn(mask, a);
		por(b, mask);
		return;
	}

	vpand(b, mask);
	vpandn(mask, a);
	vpor(b, mask);
}
// Emits a byte blend of `b` into `a`, with xmm0 as the selector
// (explicit operand on AVX, implicit in the SSE encoding).
void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
{
	if (!m_cpu.has(Xbyak::util::Cpu::tAVX))
	{
		pblendvb(a, b);
		return;
	}

	vpblendvb(a, a, b, xmm0);
}
// Same byte blend as blend8, but the result ends up in `b`.
// On SSE the blend is done into `a`, which is then copied to `b`.
void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
{
	if (!m_cpu.has(Xbyak::util::Cpu::tAVX))
	{
		pblendvb(a, b);
		movdqa(b, a);
		return;
	}

	vpblendvb(b, a, b, xmm0);
}
// Splits each 16-bit lane of `src` into its low byte (written to `l`) and its high byte (written to `h`).
// `src` may be the same register as `l` or as `h`; the branches below order the shifts so that
// `src` is not overwritten before it has been read.
void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src)
{
// l = src & 0xFF; (1 left shift + 1 right shift)
// h = (src >> 8) & 0xFF; (1 right shift)
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
{
// AVX: three-operand shifts let us read src and write l/h in one instruction
if (src == h)
{
vpsllw(l, src, 8);
vpsrlw(h, 8);
}
else if (src == l)
{
vpsrlw(h, src, 8);
vpsllw(l, 8);
}
else
{
vpsllw(l, src, 8);
vpsrlw(h, src, 8);
}
// Finish the low byte: it was shifted left by 8 above, shift it back down
vpsrlw(l, 8);
}
else
{
// SSE: copy src into whichever of l/h is not already aliased to it, then shift in place
if (src == h)
{
movdqa(l, src);
}
else if (src == l)
{
movdqa(h, src);
}
else
{
movdqa(l, src);
movdqa(h, src);
}
psllw(l, 8);
psrlw(l, 8);
psrlw(h, 8);
}
// NOTE(review): this constructs a GSDrawScanlineCodeGenerator2 over `this` and runs its Generate().
// It looks out of place in a register-splitting helper - confirm it belongs here and not in the constructor.
GSDrawScanlineCodeGenerator2(this, CPUInfo(m_cpu), (void*)&m_local, m_sel.key).Generate();
}

View File

@@ -27,117 +27,12 @@
// Code generator for the draw-scanline routine, derived from GSCodeGenerator.
// The only public member is the constructor; everything else is an internal generation step.
class GSDrawScanlineCodeGenerator : public GSCodeGenerator
{
typedef Xbyak::Ymm Ymm;
typedef Xbyak::Xmm Xmm;
typedef Xbyak::Reg8 Reg8;
typedef Xbyak::Operand Operand;
// Declared in the private section; presumably left undefined to forbid assignment - TODO confirm
void operator=(const GSDrawScanlineCodeGenerator&);
GSScanlineSelector m_sel;
GSScanlineLocalData& m_local;
bool m_rip;
void Generate();
#if _M_SSE >= 0x501
// _M_SSE >= 0x501: a single set of generation steps working on Ymm registers
void Init();
void Step();
void TestZ(const Ymm& temp1, const Ymm& temp2);
void SampleTexture();
void Wrap(const Ymm& uv0);
void Wrap(const Ymm& uv0, const Ymm& uv1);
void SampleTextureLOD();
void WrapLOD(const Ymm& uv0);
void WrapLOD(const Ymm& uv0, const Ymm& uv1);
void AlphaTFX();
void ReadMask();
void TestAlpha();
void ColorTFX();
void Fog();
void ReadFrame();
void TestDestAlpha();
void WriteMask();
void WriteZBuf();
void AlphaBlend();
void WriteFrame();
void ReadPixel(const Ymm& dst, const Ymm& temp, const RegLong& addr);
void WritePixel(const Ymm& src, const Ymm& temp, const RegLong& addr, const Xbyak::Reg32& mask, bool fast, int psm, int fz);
void WritePixel(const Xmm& src, const RegLong& addr, uint8 i, uint8 j, int psm);
void ReadTexel(int pixels, int mip_offset = 0);
void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i);
#else
// Otherwise: two parallel sets of generation steps on Xmm registers, suffixed _SSE and _AVX
void Generate_SSE();
void Init_SSE();
void Step_SSE();
void TestZ_SSE(const Xmm& temp1, const Xmm& temp2);
void SampleTexture_SSE();
void Wrap_SSE(const Xmm& uv0);
void Wrap_SSE(const Xmm& uv0, const Xmm& uv1);
void SampleTextureLOD_SSE();
void WrapLOD_SSE(const Xmm& uv0);
void WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1);
void AlphaTFX_SSE();
void ReadMask_SSE();
void TestAlpha_SSE();
void ColorTFX_SSE();
void Fog_SSE();
void ReadFrame_SSE();
void TestDestAlpha_SSE();
void WriteMask_SSE();
void WriteZBuf_SSE();
void AlphaBlend_SSE();
void WriteFrame_SSE();
void ReadPixel_SSE(const Xmm& dst, const RegLong& addr);
void WritePixel_SSE(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz);
void WritePixel_SSE(const Xmm& src, const RegLong& addr, uint8 i, int psm);
void ReadTexel_SSE(int pixels, int mip_offset = 0);
void ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i);
void Generate_AVX();
void Init_AVX();
void Step_AVX();
void TestZ_AVX(const Xmm& temp1, const Xmm& temp2);
void SampleTexture_AVX();
void Wrap_AVX(const Xmm& uv0);
void Wrap_AVX(const Xmm& uv0, const Xmm& uv1);
void SampleTextureLOD_AVX();
void WrapLOD_AVX(const Xmm& uv0);
void WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1);
void AlphaTFX_AVX();
void ReadMask_AVX();
void TestAlpha_AVX();
void ColorTFX_AVX();
void Fog_AVX();
void ReadFrame_AVX();
void TestDestAlpha_AVX();
void WriteMask_AVX();
void WriteZBuf_AVX();
void AlphaBlend_AVX();
void WriteFrame_AVX();
void ReadPixel_AVX(const Xmm& dst, const RegLong& addr);
void WritePixel_AVX(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz);
void WritePixel_AVX(const Xmm& src, const RegLong& addr, uint8 i, int psm);
void ReadTexel_AVX(int pixels, int mip_offset = 0);
void ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i);
#endif
// Shared helpers that pick the SSE or AVX encoding at generation time
void modulate16(const Xmm& a, const Operand& f, uint8 shift);
void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, uint8 shift);
void lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f);
void mix16(const Xmm& a, const Xmm& b, const Xmm& temp);
void clamp16(const Xmm& a, const Xmm& temp);
void alltrue(const Xmm& test);
void blend(const Xmm& a, const Xmm& b, const Xmm& mask);
void blendr(const Xmm& b, const Xmm& a, const Xmm& mask);
void blend8(const Xmm& a, const Xmm& b);
void blend8r(const Xmm& b, const Xmm& a);
void split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src);
public:
GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);
};

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,118 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "GSDrawScanlineCodeGenerator.h"
// Placeholder SSE implementation, compiled only when _M_SSE < 0x501 and _M_AMD64 or _WIN64 is defined.
// Generate_SSE emits a single `ret`; every other *_SSE member below is an empty stub.
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
// It is useless to port the code to SSEx, better use the faster 32 bits version instead
void GSDrawScanlineCodeGenerator::Generate_SSE()
{
// Avoid a crash if someone want to use it
ret();
}
// Empty stub
void GSDrawScanlineCodeGenerator::Init_SSE()
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::Step_SSE()
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2)
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::SampleTexture_SSE()
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv)
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1)
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::AlphaTFX_SSE()
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::ReadMask_SSE()
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::TestAlpha_SSE()
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::ColorTFX_SSE()
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::Fog_SSE()
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::ReadFrame_SSE()
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::TestDestAlpha_SSE()
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::WriteMask_SSE()
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::WriteZBuf_SSE()
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::AlphaBlend_SSE()
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::WriteFrame_SSE()
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::ReadPixel_SSE(const Xmm& dst, const RegLong& addr)
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz)
{
}
//static const int s_offsets[4] = {0, 2, 8, 10};
// Empty stub
void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const RegLong& addr, uint8 i, int psm)
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset)
{
}
// Empty stub
void GSDrawScanlineCodeGenerator::ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i)
{
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -13,26 +13,5 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
using namespace Xbyak;
#ifdef _M_AMD64
// Yeah let use mips naming ;)
#ifdef _WIN64
#define a0 rcx
#define a1 rdx
#define a2 r8
#define a3 r9
#define t0 rdi
#define t1 rsi
#else
#define a0 rdi
#define a1 rsi
#define a2 rdx
#define a3 rcx
#define t0 r8
#define t1 r9
#endif
#endif
#include "PrecompiledHeader.h"
#include "GSNewCodeGenerator.h"

View File

@@ -0,0 +1,489 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "GS/GS_types.h"
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
/// Vector instruction-set levels, listed from lowest to highest.
/// The numeric values are ordered so that levels can be compared with `>=`.
namespace SSEVersion
{
	enum SSEVersion
	{
		SSE41 = 0x401,
		AVX = 0x500,
		AVX2 = 0x501,
	};
} // namespace SSEVersion
/// Similar to Xbyak::util::cpu but more open to us putting in extra flags (e.g. "vpgatherdd is fast"), as well as making it easier to test other configurations by artificially limiting features
struct CPUInfo
{
	/// True when the FMA feature bit is reported
	bool hasFMA = false;
	/// Highest supported level; SSE4.1 is the default
	SSEVersion::SSEVersion sseVersion = SSEVersion::SSE41;

	CPUInfo() = default;

	/// Fills in the flags from Xbyak's CPU detection.
	/// AVX2 wins over AVX, and SSE4.1 is used when neither is reported.
	CPUInfo(const Xbyak::util::Cpu& cpu)
		: hasFMA(cpu.has(cpu.tFMA))
		, sseVersion(cpu.has(cpu.tAVX2) ? SSEVersion::AVX2
		             : cpu.has(cpu.tAVX) ? SSEVersion::AVX
		                                 : SSEVersion::SSE41)
	{
	}
};
/// Code generator that automatically selects between SSE and AVX, x86 and x64 so you don't have to
/// Should make combined SSE and AVX codegen much easier
class GSNewCodeGenerator
{
public:
using Address = Xbyak::Address;
using Label = Xbyak::Label;
using Operand = Xbyak::Operand;
using Reg32e = Xbyak::Reg32e;
using Reg32 = Xbyak::Reg32;
using Reg16 = Xbyak::Reg16;
using Reg8 = Xbyak::Reg8;
using Reg = Xbyak::Reg;
using Xmm = Xbyak::Xmm;
using Ymm = Xbyak::Ymm;
using Zmm = Xbyak::Zmm;
/// Exception thrown when an instruction or register is requested that the current target cannot encode.
/// `value` identifies the failure; `what()` returns the matching human-readable message.
class Error : public std::exception
{
public:
	enum Value
	{
		ERR_64_BIT_REG_IN_32,
		ERR_64_INSTR_IN_32,
		ERR_SSE_INSTR_IN_AVX,
		ERR_AVX_INSTR_IN_SSE,
	};

	Value value;

	Error(Value value) : value(value) {}

	/// Returns the message for `value`, or a generic message if `value` is outside the table.
	/// Marked `override` so the compiler checks that this really replaces std::exception::what().
	const char* what() const noexcept override
	{
		// Indexed by Value: keep the order in sync with the enum above
		static const char* const tbl[] = {
			"used 64-bit register in 32-bit code",
			"used 64-bit only instruction in 32-bit code",
			"used SSE instruction in AVX code",
			"used AVX instruction in SSE code",
		};

		// Unsigned conversion makes a negative value fail the bounds check too
		const unsigned int idx = static_cast<unsigned int>(value);
		if (idx < (sizeof(tbl) / sizeof(*tbl)))
			return tbl[idx];

		return "GSNewCodeGenerator Unknown Error";
	}
};
private:
/// Checks that an operand is encodable on the current target.
/// Nothing is checked on 64-bit builds. On 32-bit builds, an extended-index or extended 8-bit register
/// raises ERR_64_BIT_REG_IN_32, and for memory operands the index and base registers are checked recursively.
void validateRegister(const Operand& op)
{
	if (!is64)
	{
		const bool needs64 = op.isREG() && (op.isExtIdx() || op.isExt8bit());
		if (needs64)
			throw Error(Error::ERR_64_BIT_REG_IN_32);

		if (op.isMEM())
		{
			auto regExp = static_cast<const Address&>(op).getRegExp();
			validateRegister(regExp.getIndex());
			validateRegister(regExp.getBase());
		}
	}
}
/// Immediate overload: there is nothing to validate, it exists so the FORWARD macros can call
/// validateRegister on every argument regardless of its type
void validateRegister(int imm) {}
/// Throws ERR_64_INSTR_IN_32 unless this is a 64-bit build
void require64()
{
	if (is64)
		return;

	throw Error(Error::ERR_64_INSTR_IN_32);
}
/// Throws ERR_AVX_INSTR_IN_SSE unless the target has AVX
void requireAVX()
{
	if (hasAVX)
		return;

	throw Error(Error::ERR_AVX_INSTR_IN_SSE);
}
public:
/// The Xbyak generator that instructions are forwarded to
Xbyak::CodeGenerator& actual;
// Target-width selection. Depending on _M_X86_64 this fixes:
//   is32 / is64        - compile-time width flags (exactly one is true)
//   AddressReg         - Reg64 on 64-bit, Reg32 on 32-bit
//   RipType            - Xbyak::RegRip on 64-bit, a plain int placeholder on 32-bit
//   Choose3264<A, B>   - type selector: B on 64-bit, A on 32-bit
//   choose3264(a, b)   - value selector: returns b on 64-bit, a on 32-bit
#if defined(_M_X86_64)
constexpr static bool is32 = false;
constexpr static bool is64 = true;
using AddressReg = Xbyak::Reg64;
using RipType = Xbyak::RegRip;
template <typename T32, typename T64>
struct Choose3264 { using type = T64; };
template <typename T32, typename T64>
static T64 choose3264(T32 t32, T64 t64) { return t64; }
#else
constexpr static bool is32 = true;
constexpr static bool is64 = false;
using AddressReg = Xbyak::Reg32;
using RipType = int;
template <typename T32, typename T64>
struct Choose3264 { using type = T32; };
template <typename T32, typename T64>
static T32 choose3264(T32 t32, T64 t64) { return t32; }
#endif
// Feature flags of the target, fixed at construction from the CPUInfo passed in
const bool hasAVX, hasAVX2, hasFMA;
// Register constants, each constructed from its register index.
// rax..r15 are AddressReg, so they are 32-bit registers on 32-bit builds and 64-bit registers on 64-bit builds.
const Xmm xmm0{0}, xmm1{1}, xmm2{2}, xmm3{3}, xmm4{4}, xmm5{5}, xmm6{6}, xmm7{7}, xmm8{8}, xmm9{9}, xmm10{10}, xmm11{11}, xmm12{12}, xmm13{13}, xmm14{14}, xmm15{15};
const Ymm ymm0{0}, ymm1{1}, ymm2{2}, ymm3{3}, ymm4{4}, ymm5{5}, ymm6{6}, ymm7{7}, ymm8{8}, ymm9{9}, ymm10{10}, ymm11{11}, ymm12{12}, ymm13{13}, ymm14{14}, ymm15{15};
const AddressReg rax{0}, rcx{1}, rdx{2}, rbx{3}, rsp{4}, rbp{5}, rsi{6}, rdi{7}, r8{8}, r9{9}, r10{10}, r11{11}, r12{12}, r13{13}, r14{14}, r15{15};
const Reg32 eax{0}, ecx{1}, edx{2}, ebx{3}, esp{4}, ebp{5}, esi{6}, edi{7}, r8d{8}, r9d{9}, r10d{10}, r11d{11}, r12d{12}, r13d{13}, r14d{14}, r15d{15};
const Reg16 ax{0}, cx{1}, dx{2}, bx{3}, sp{4}, bp{5}, si{6}, di{7};
const Reg8 al{0}, cl{1}, dl{2}, bl{3}, ah{4}, ch{5}, dh{6}, bh{7};
// RegRip on 64-bit builds, a value-initialized int on 32-bit builds (see RipType)
const RipType rip{};
// Address frames, each constructed with the operand size in bits (0 for the unsized `ptr`)
const Xbyak::AddressFrame ptr{0}, byte{8}, word{16}, dword{32}, qword{64}, xword{128}, yword{256}, zword{512};
/// Wraps an existing Xbyak generator and records which instruction sets may be used.
/// @param actual  generator to forward to; it is dereferenced unconditionally, so it must not be null
/// @param cpu     feature description; hasAVX / hasAVX2 are derived from cpu.sseVersion, hasFMA from cpu.hasFMA
GSNewCodeGenerator(Xbyak::CodeGenerator* actual, CPUInfo cpu)
: actual(*actual)
, hasAVX(cpu.sseVersion >= SSEVersion::AVX)
, hasAVX2(cpu.sseVersion >= SSEVersion::AVX2)
, hasFMA(cpu.hasFMA)
{
}
// ------------ Forwarding instructions ------------
// Note: Only instructions used by codegen were added here, so if you're modifying codegen, you may need to add instructions here
// For instructions available in both SSE and AVX, two functions are provided: one with the SSE name and arguments that forwards to the SSE or AVX encoding depending on the target, and one with the AVX name and arguments that forwards to the AVX version or throws on SSE
// ARGS_* macros are provided for shorter argument lists. The following single-letter abbreviations are used: X=Xmm, Y=Ymm, O=Operand, A=Address, I=Immediate
// FORWARD(argcount, category, instrname, argtypes...) forwards an instruction. The following categories are available:
// BASE: non-SSE
// SSE: available on SSE and v-prefixed on AVX
// SSEONLY: available only on SSE (exception on AVX)
// AVX: available only on AVX (exception on SSE)
// AVX2: available only on AVX2 (exception on AVX/SSE)
// FMA: available only with FMA
// SFORWARD forwards an SSE-AVX pair where the AVX variant takes the same number of registers (e.g. pshufd dst, src + vpshufd dst, src)
// AFORWARD forwards an SSE-AVX pair where the AVX variant takes an extra destination register (e.g. shufps dst, src + vshufps dst, src, src)
// Implementation details:
// ACTUAL_FORWARD_*: Actually forward the function of the given type
// FORWARD#: First validates the arguments (e.g. make sure you're not passing registers over 7 on x86), then forwards to an ACTUAL_FORWARD_*
// Big thanks to https://stackoverflow.com/a/24028231 for helping me figure out how to work around MSVC's terrible macro expander
// Of course GCC/Clang don't like the workaround so enjoy the ifdefs
// Applies `macro` to an already-parenthesized argument list
#define EXPAND_ARGS(macro, args) macro args
// BASE: always forwards to actual.name
#define ACTUAL_FORWARD_BASE(name, ...) \
actual.name(__VA_ARGS__);
// SSE: forwards to the v-prefixed instruction when hasAVX is set, to the plain one otherwise
#define ACTUAL_FORWARD_SSE(name, ...) \
if (hasAVX) \
actual.v##name(__VA_ARGS__); \
else \
actual.name(__VA_ARGS__);
// SSEONLY: throws ERR_SSE_INSTR_IN_AVX when hasAVX is set, forwards otherwise
#define ACTUAL_FORWARD_SSEONLY(name, ...) \
if (hasAVX) \
throw Error(Error::ERR_SSE_INSTR_IN_AVX); \
else \
actual.name(__VA_ARGS__);
// AVX: forwards when hasAVX is set, throws ERR_AVX_INSTR_IN_SSE otherwise
#define ACTUAL_FORWARD_AVX(name, ...) \
if (hasAVX) \
actual.name(__VA_ARGS__); \
else \
throw Error(Error::ERR_AVX_INSTR_IN_SSE);
// AVX2: forwards when hasAVX2 is set; the failure reuses ERR_AVX_INSTR_IN_SSE
#define ACTUAL_FORWARD_AVX2(name, ...) \
if (hasAVX2) \
actual.name(__VA_ARGS__); \
else \
throw Error(Error::ERR_AVX_INSTR_IN_SSE);
// FMA: forwards when hasFMA is set; the failure reuses ERR_AVX_INSTR_IN_SSE
#define ACTUAL_FORWARD_FMA(name, ...) \
if (hasFMA) \
actual.name(__VA_ARGS__); \
else \
throw Error(Error::ERR_AVX_INSTR_IN_SSE);
// FORWARD1..FORWARD4 define a member function `name` taking 1 to 4 arguments.
// Each argument goes through validateRegister, then the call is dispatched via ACTUAL_FORWARD_<category>.
#define FORWARD1(category, name, type) \
void name(type a) \
{ \
validateRegister(a); \
ACTUAL_FORWARD_##category(name, a) \
}
#define FORWARD2(category, name, type1, type2) \
void name(type1 a, type2 b) \
{ \
validateRegister(a); \
validateRegister(b); \
ACTUAL_FORWARD_##category(name, a, b) \
}
#define FORWARD3(category, name, type1, type2, type3) \
void name(type1 a, type2 b, type3 c) \
{ \
validateRegister(a); \
validateRegister(b); \
validateRegister(c); \
ACTUAL_FORWARD_##category(name, a, b, c) \
}
#define FORWARD4(category, name, type1, type2, type3, type4) \
void name(type1 a, type2 b, type3 c, type4 d) \
{ \
validateRegister(a); \
validateRegister(b); \
validateRegister(c); \
validateRegister(d); \
ACTUAL_FORWARD_##category(name, a, b, c, d) \
}
// FORWARD(argcount, category, name, types...) selects FORWARD<argcount>.
// Two spellings are needed: the non-GNU branch routes every step through EXPAND_ARGS.
#ifdef __GNUC__
#define FORWARD_(argcount, ...) FORWARD##argcount(__VA_ARGS__)
// Gets the macro evaluator to evaluate in the right order
#define FORWARD(...) FORWARD_(__VA_ARGS__)
#else
#define FORWARD_(argcount, ...) EXPAND_ARGS(FORWARD##argcount, (__VA_ARGS__))
// Gets the macro evaluator to evaluate in the right order
#define FORWARD(...) EXPAND_ARGS(FORWARD_, (__VA_ARGS__))
#endif
#define FORWARD_SSE_XMM0(name) \
void name(const Xmm& a, const Operand& b) \
{ \
validateRegister(a); \
validateRegister(b); \
if (hasAVX) \
actual.v##name(a, b, Xmm(0)); \
else \
actual.name(a, b); \
} \
FORWARD(4, AVX, v##name, const Xmm&, const Xmm&, const Operand&, const Xmm&)
// Defines the three overloads of a jump instruction (raw address, Label object,
// label name) and forwards each straight to `actual`. No validateRegister or
// feature check is performed; the label overloads default to T_AUTO.
#define FORWARD_JUMP(name) \
void name(const void *addr) { actual.name(addr); } \
void name(const Label& label, Xbyak::CodeGenerator::LabelType type = Xbyak::CodeGenerator::T_AUTO) { actual.name(label, type); } \
void name(const char *label, Xbyak::CodeGenerator::LabelType type = Xbyak::CodeGenerator::T_AUTO) { actual.name(label, type); }
// ADD_ONE_<n> expands to n + 1. AFORWARD_ uses it to compute the argument count
// of the v-prefixed form, which takes one more operand than the SSE form.
#define ADD_ONE_2 3
#define ADD_ONE_3 4
// SFORWARD(argcount, name, types...): forwarder for `name` in the SSE category.
// AFORWARD(argcount, name, types...): SFORWARD for `name`, plus an AVX-category
// forwarder for `v##name` whose signature repeats the first argument type
// (arg1, arg1, rest...).
// As with FORWARD, the non-GNU branch wraps each step in EXPAND_ARGS.
#ifdef __GNUC__
#define SFORWARD(argcount, name, ...) FORWARD(argcount, SSE, name, __VA_ARGS__)
#define AFORWARD_(argcount, name, arg1, ...) \
SFORWARD(argcount, name, arg1, __VA_ARGS__) \
FORWARD(ADD_ONE_##argcount, AVX, v##name, arg1, arg1, __VA_ARGS__)
// The extra level of indirection lets ARGS_* macros expand before AFORWARD_
// splits the list into arg1 and the rest
#define AFORWARD(...) EXPAND_ARGS(AFORWARD_, (__VA_ARGS__))
#else
#define SFORWARD(argcount, name, ...) EXPAND_ARGS(FORWARD, (argcount, SSE, name, __VA_ARGS__))
#define AFORWARD_(argcount, name, arg1, ...) \
EXPAND_ARGS(SFORWARD, (argcount, name, arg1, __VA_ARGS__)) \
EXPAND_ARGS(FORWARD, (ADD_ONE_##argcount, AVX, v##name, arg1, arg1, __VA_ARGS__))
// The extra level of indirection lets ARGS_* macros expand before AFORWARD_
// splits the list into arg1 and the rest
#define AFORWARD(...) EXPAND_ARGS(AFORWARD_, (__VA_ARGS__))
#endif
// Defines both the (Operand, Operand) and the (Operand, immediate) overload of
// a BASE-category two-operand instruction.
#define FORWARD_OO_OI(name) \
FORWARD(2, BASE, name, ARGS_OO) \
FORWARD(2, BASE, name, ARGS_OI)
// Shorthand parameter-type lists for the forwarder macros.
// Letters: O = Operand, X = Xmm, I = immediate (integer type differs per list).
#define ARGS_OI const Operand&, uint32
#define ARGS_OO const Operand&, const Operand&
#define ARGS_XI const Xmm&, int
#define ARGS_XO const Xmm&, const Operand&
#define ARGS_XOI const Xmm&, const Operand&, uint8
#define ARGS_XXO const Xmm&, const Xmm&, const Operand&
// For instructions that are ifdef'd out without XBYAK64:
// REQUIRE64(action) always calls require64(); `action` is only compiled in
// when XBYAK64 is defined, so the wrapper still builds when the underlying
// member does not exist.
// Expands to two statements, so only use it inside a braced block.
#ifdef XBYAK64
#define REQUIRE64(action) require64(); action
#else
#define REQUIRE64(action) require64()
#endif
// ---- Hand-written forwarders (no operands to validate) ----

/// Returns whatever the wrapped generator reports as its current position.
const uint8 *getCurr()
{
	return actual.getCurr();
}

/// Forwards to actual.align(); the argument defaults to 16.
void align(int x = 16)
{
	actual.align(x);
}

/// Forwards a db() request to the wrapped generator.
void db(int code)
{
	actual.db(code);
}

/// Defines a named label at the current position of the wrapped generator.
void L(const std::string& label)
{
	actual.L(label);
}

/// cdqe goes through REQUIRE64, so require64() runs before it is emitted.
void cdqe()
{
	REQUIRE64(actual.cdqe());
}

/// Forwards ret, with an optional immediate (default 0).
void ret(int imm = 0)
{
	actual.ret(imm);
}

/// Calls requireAVX() and then emits vzeroupper.
void vzeroupper()
{
	requireAVX();
	actual.vzeroupper();
}

/// Calls requireAVX() and then emits vzeroall.
void vzeroall()
{
	requireAVX();
	actual.vzeroall();
}
// ---- Forwarded instruction set ----
// Every line below instantiates one or more member functions through the
// forwarder macros. The macros expand to complete function definitions, so the
// invocations take no trailing semicolon (one would leave an empty member
// declaration behind and trigger -Wextra-semi / pedantic warnings).

// General-purpose instructions (BASE category)
FORWARD_OO_OI(add)
FORWARD_OO_OI(and)
FORWARD_OO_OI(cmp)
FORWARD_OO_OI(or)
FORWARD_OO_OI(sub)
FORWARD_OO_OI(xor)
FORWARD(2, BASE, lea, const Reg&, const Address&)
FORWARD(2, BASE, mov, const Operand&, size_t)
FORWARD(2, BASE, mov, ARGS_OO)
FORWARD(2, BASE, movzx, const Reg&, const Operand&)
FORWARD(1, BASE, not, const Operand&)
FORWARD(1, BASE, pop, const Operand&)
FORWARD(1, BASE, push, const Operand&)
FORWARD(2, BASE, sar, const Operand&, const Reg8&)
FORWARD(2, BASE, sar, ARGS_OI)
FORWARD(2, BASE, shl, const Operand&, const Reg8&)
FORWARD(2, BASE, shl, ARGS_OI)
FORWARD(2, BASE, shr, const Operand&, const Reg8&)
FORWARD(2, BASE, shr, ARGS_OI)
FORWARD(2, BASE, test, const Operand&, const Reg&)
FORWARD(2, BASE, test, ARGS_OI)

// Jumps
FORWARD_JUMP(je)
FORWARD_JUMP(jle)
FORWARD_JUMP(jmp)

// Vector instructions.
// SFORWARD defines only `name`; AFORWARD additionally defines `v<name>` with a
// separate destination operand.
AFORWARD(2, addps, ARGS_XO)
SFORWARD(2, cvtdq2ps, ARGS_XO)
SFORWARD(2, cvtps2dq, ARGS_XO)
SFORWARD(2, cvttps2dq, ARGS_XO)
SFORWARD(3, extractps, const Operand&, const Xmm&, uint8)
AFORWARD(2, maxps, ARGS_XO)
AFORWARD(2, minps, ARGS_XO)
SFORWARD(2, movaps, ARGS_XO)
SFORWARD(2, movaps, const Address&, const Xmm&)
SFORWARD(2, movd, const Address&, const Xmm&)
SFORWARD(2, movd, const Reg32&, const Xmm&)
SFORWARD(2, movd, const Xmm&, const Address&)
SFORWARD(2, movd, const Xmm&, const Reg32&)
SFORWARD(2, movdqa, ARGS_XO)
SFORWARD(2, movdqa, const Address&, const Xmm&)
SFORWARD(2, movhps, ARGS_XO)
SFORWARD(2, movhps, const Address&, const Xmm&)
SFORWARD(2, movq, const Address&, const Xmm&)
SFORWARD(2, movq, const Xmm&, const Address&)
AFORWARD(2, mulps, ARGS_XO)
AFORWARD(2, orps, ARGS_XO)
AFORWARD(2, packssdw, ARGS_XO)
AFORWARD(2, packusdw, ARGS_XO)
AFORWARD(2, packuswb, ARGS_XO)
AFORWARD(2, paddd, ARGS_XO)
AFORWARD(2, paddusb, ARGS_XO)
AFORWARD(2, paddw, ARGS_XO)
AFORWARD(2, pand, ARGS_XO)
AFORWARD(2, pandn, ARGS_XO)
AFORWARD(3, pblendw, ARGS_XOI)
AFORWARD(2, pcmpeqd, ARGS_XO)
AFORWARD(2, pcmpeqw, ARGS_XO)
AFORWARD(2, pcmpgtd, ARGS_XO)
SFORWARD(3, pextrd, const Operand&, const Xmm&, uint8)
SFORWARD(3, pextrw, const Operand&, const Xmm&, uint8)
AFORWARD(3, pinsrd, ARGS_XOI)
AFORWARD(2, pmaxsw, ARGS_XO)
AFORWARD(2, pminsd, ARGS_XO)
AFORWARD(2, pminsw, ARGS_XO)
SFORWARD(2, pmovsxbd, ARGS_XO)
SFORWARD(2, pmovmskb, const Reg32e&, const Xmm&)
SFORWARD(2, pmovzxbw, ARGS_XO)
AFORWARD(2, pmulhrsw, ARGS_XO)
AFORWARD(2, pmulhw, ARGS_XO)
AFORWARD(2, pmullw, ARGS_XO)
AFORWARD(2, por, ARGS_XO)
SFORWARD(3, pshufd, ARGS_XOI)
SFORWARD(3, pshufhw, ARGS_XOI)
SFORWARD(3, pshuflw, ARGS_XOI)
AFORWARD(2, pslld, ARGS_XI)
AFORWARD(2, psllw, ARGS_XI)
AFORWARD(2, psrad, ARGS_XI)
AFORWARD(2, psrad, ARGS_XO)
AFORWARD(2, psraw, ARGS_XI)
AFORWARD(2, psrld, ARGS_XI)
AFORWARD(2, psrldq, ARGS_XI)
AFORWARD(2, psrlw, ARGS_XI)
AFORWARD(2, psrlw, ARGS_XO)
AFORWARD(2, psubd, ARGS_XO)
AFORWARD(2, psubw, ARGS_XO)
AFORWARD(2, punpckhdq, ARGS_XO)
AFORWARD(2, punpckhwd, ARGS_XO)
AFORWARD(2, punpcklbw, ARGS_XO)
AFORWARD(2, punpckldq, ARGS_XO)
AFORWARD(2, punpcklqdq,ARGS_XO)
AFORWARD(2, punpcklwd, ARGS_XO)
AFORWARD(2, pxor, ARGS_XO)
SFORWARD(2, rcpps, ARGS_XO)
AFORWARD(3, shufps, ARGS_XOI)
AFORWARD(2, subps, ARGS_XO)
AFORWARD(2, xorps, ARGS_XO)

// Instruction forwarded through FORWARD_SSE_XMM0 (Xmm(0) is passed explicitly to the AVX form)
FORWARD_SSE_XMM0(pblendvb)

// Instructions that only exist in VEX form (AVX / AVX2 / FMA categories)
FORWARD(2, AVX, vbroadcastss, ARGS_XO)
FORWARD(2, AVX2, vbroadcasti128, const Ymm&, const Address&)
FORWARD(2, AVX, vbroadcastf128, const Ymm&, const Address&)
FORWARD(3, FMA, vfmadd213ps, ARGS_XXO)
FORWARD(3, AVX2, vextracti128, const Operand&, const Ymm&, uint8)
FORWARD(4, AVX2, vinserti128, const Ymm&, const Ymm&, const Operand&, uint8)
FORWARD(2, AVX2, vpbroadcastd, ARGS_XO)
FORWARD(2, AVX2, vpbroadcastq, ARGS_XO)
FORWARD(2, AVX2, vpbroadcastw, ARGS_XO)
FORWARD(3, AVX2, vpermq, const Ymm&, const Operand&, uint8)
FORWARD(3, AVX2, vpgatherdd, const Xmm&, const Address&, const Xmm&)
FORWARD(3, AVX2, vpsravd, ARGS_XXO)
FORWARD(3, AVX2, vpsrlvd, ARGS_XXO)
// Remove every helper macro used to build the forwarders so that none of them
// leak out of this class definition into code that includes this header.
#undef REQUIRE64
#undef ARGS_OI
#undef ARGS_OO
#undef ARGS_XI
#undef ARGS_XO
#undef ARGS_XOI
#undef ARGS_XXO
#undef FORWARD_OO_OI
#undef AFORWARD
#undef AFORWARD_
#undef SFORWARD
#undef ADD_ONE_2
#undef ADD_ONE_3
#undef FORWARD_SSE_XMM0
#undef FORWARD_JUMP
#undef FORWARD
#undef FORWARD_
#undef FORWARD4
#undef FORWARD3
#undef FORWARD2
#undef FORWARD1
#undef ACTUAL_FORWARD_FMA
#undef ACTUAL_FORWARD_AVX2
#undef ACTUAL_FORWARD_AVX
#undef ACTUAL_FORWARD_SSE
#undef ACTUAL_FORWARD_SSEONLY
#undef ACTUAL_FORWARD_BASE
#undef EXPAND_ARGS
};

View File

@@ -0,0 +1,566 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "GS/GS_types.h"
#include "GSSetupPrimCodeGenerator.all.h"
#include "GSVertexSW.h"
using namespace Xbyak;
// Builds the memory operand for `m_local.field`.
// When is32 or m_rip is set, the operand is formed from the field's own address
// (`rip + (char*)&m_local.field`); otherwise it is `_m_local` plus the field's
// offset inside GSScanlineLocalData. OFFSETOF (not offsetof) is used because
// `field` may contain a run-time index such as d[i].f.
#define _rip_local(field) ((is32 || m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)])
// On 64-bit the address of m_local lives in the _64_t0 register
// (Generate() loads it there when m_rip is false)
#define _64_m_local _64_t0
/// On AVX, does a v-prefixed separate destination operation
/// On SSE, moves src1 into dst using movdqa, then does the operation
/// Because the SSE path overwrites dst before the operation runs, dst must not
/// also appear among the remaining operands (unless it is the same register as src1).
/// Wrapped in do/while(0) so it can be used as a single statement, e.g. in an unbraced if/else.
#define THREEARG(operation, dst, src1, ...) \
do \
{ \
if (hasAVX) \
{ \
v##operation(dst, src1, __VA_ARGS__); \
} \
else \
{ \
movdqa(dst, src1); \
operation(dst, __VA_ARGS__); \
} \
} while (0)
// Accessors for the "whole vector step" deltas in m_local.
// With _M_SSE >= 0x501 they live in `d8` (and z/f are nested one level deeper,
// in `d8.p`); otherwise they live directly in `d4`.
//  _rip_local_d(x)   -> d8.x   / d4.x
//  _rip_local_d_p(x) -> d8.p.x / d4.x
#if _M_SSE >= 0x501
#define _rip_local_d(x) _rip_local(d8.x)
#define _rip_local_d_p(x) _rip_local_d(p.x)
#else
#define _rip_local_d(x) _rip_local(d4.x)
#define _rip_local_d_p(x) _rip_local_d(x)
#endif
/// Binds the generator to its scanline-local data block and selector key.
/// @param base  Xbyak generator that receives the emitted code (handed to the parent class)
/// @param cpu   CPU feature description (handed to the parent class)
/// @param param Pointer to the GSScanlineLocalData the generated code writes into
/// @param key   Selector key, stored in m_sel.key
GSSetupPrimCodeGenerator2::GSSetupPrimCodeGenerator2(Xbyak::CodeGenerator* base, CPUInfo cpu, void* param, uint64 key)
	: _parent(base, cpu)
	, m_local(*static_cast<GSScanlineLocalData*>(param))
	, m_rip(false)
	, many_regs(false)
	// Register assignment differs between the Windows and non-Windows builds.
	// On x86 arg registers are very temporary but on x64 they aren't, so on x86 some registers overlap
#ifdef _WIN32
	, _64_vertex(is64 ? rcx : r8)
	, _index(is64 ? rdx : rcx)
	, _dscan(is64 ? r8 : rdx)
	, _64_t0(r9)
	, t1(is64 ? r10 : rcx)
#else
	, _64_vertex(is64 ? rdi : r8)
	, _index(is64 ? rsi : rcx)
	, _dscan(rdx)
	, _64_t0(is64 ? rcx : r8)
	, t1(is64 ? r8 : rcx)
#endif
	, _m_local(chooseLocal(&m_local, _64_m_local))
{
	m_sel.key = key;

	// Derive which interpolants the generated routine has to set up
	const bool writes_fb = m_sel.fb != 0;
	const bool decal_with_tcc = (m_sel.tfx == TFX_DECAL) && m_sel.tcc;

	m_en.z = m_sel.zb ? 1 : 0;
	m_en.f = (writes_fb && m_sel.fge) ? 1 : 0;
	m_en.t = (writes_fb && m_sel.tfx != TFX_NONE) ? 1 : 0;
	m_en.c = (writes_fb && !decal_with_tcc) ? 1 : 0;
}
void GSSetupPrimCodeGenerator2::broadcastf128(const XYm& reg, const Address& mem)
{
#if SETUP_PRIM_USING_YMM
vbroadcastf128(reg, mem);
#else
movaps(reg, mem);
#endif
}
void GSSetupPrimCodeGenerator2::Generate()
{
// Technically we just need the delta < 2GB
m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000;
bool needs_shift = (m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip;
many_regs = is64 && isYmm && !m_sel.notest && needs_shift;
#ifdef _WIN64
int needs_saving = many_regs ? 6 : m_sel.notest ? 0 : 2;
if (needs_saving)
{
sub(rsp, 8 + 16 * needs_saving);
for (int i = 0; i < needs_saving; i++)
{
movdqa(ptr[rsp + i * 16], Xmm(i + 6));
}
}
#endif
if (is64 && !m_rip)
mov(_64_m_local, (size_t)&m_local);
if (needs_shift)
{
if (is32)
mov(_dscan, ptr[rsp + _32_dscan]);
if (isXmm)
mov(rax, (size_t)g_const->m_shift_128b);
else
mov(rax, (size_t)g_const->m_shift_256b);
for (int i = 0; i < (m_sel.notest ? 2 : many_regs ? 9 : 5); i++)
{
movaps(XYm(3 + i), ptr[rax + i * vecsize]);
}
}
if (isXmm)
Depth_XMM();
else
Depth_YMM();
Texture();
Color();
#ifdef _WIN64
if (needs_saving)
{
for (int i = 0; i < needs_saving; i++)
{
movdqa(Xmm(i + 6), ptr[rsp + i * 16]);
}
add(rsp, 8 + 16 * needs_saving);
}
#endif
if (isYmm)
vzeroupper();
ret();
}
void GSSetupPrimCodeGenerator2::Depth_XMM()
{
if (!m_en.z && !m_en.f)
{
return;
}
if (m_sel.prim != GS_SPRITE_CLASS)
{
// GSVector4 p = dscan.p;
movaps(xmm0, ptr[_dscan + offsetof(GSVertexSW, p)]);
if (m_en.f)
{
// GSVector4 df = p.wwww();
THREEARG(shufps, xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
THREEARG(mulps, xmm2, xmm1, xmm3);
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(_rip_local_d_p(f), xmm2);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
THREEARG(mulps, xmm2, xmm1, XYm(4 + i));
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(_rip_local(d[i].f), xmm2);
}
}
if (m_en.z)
{
// GSVector4 dz = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_local.d4.z = dz * 4.0f;
THREEARG(mulps, xmm1, xmm0, xmm3);
movdqa(_rip_local_d_p(z), xmm1);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].z = dz * m_shift[i];
THREEARG(mulps, xmm1, xmm0, XYm(4 + i));
movdqa(_rip_local(d[i].z), xmm1);
}
}
}
else
{
// GSVector4 p = vertex[index[1]].p;
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + sizeof(uint32) * 1]);
shl(eax, 6); // * sizeof(GSVertexSW)
if (is64)
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
if (m_en.f)
{
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
movaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
cvttps2dq(xmm1, xmm0);
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(_rip_local(p.f), xmm1);
}
if (m_en.z)
{
// uint32 z is bypassed in t.w
movdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
movdqa(_rip_local(p.z), xmm0);
}
}
}
void GSSetupPrimCodeGenerator2::Depth_YMM()
{
if (!m_en.z && !m_en.f)
{
return;
}
if (m_sel.prim != GS_SPRITE_CLASS)
{
// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, p)]);
vmulps(ymm1, ymm0, ymm3);
if (m_en.z)
{
// m_local.d8.p.z = dp8.extract32<2>();
extractps(_rip_local_d_p(z), xmm1, 2);
// GSVector8 dz = GSVector8(dscan.p).zzzz();
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
}
if (m_en.f)
{
// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
cvtps2dq(ymm1, ymm1);
pextrd(_rip_local_d_p(f), xmm1, 3);
// GSVector8 df = GSVector8(dscan.p).wwww();
vshufps(ymm1, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
}
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
{
if (m_en.z)
{
// m_local.d[i].z = dz * shift[1 + i];
// Save a byte in the encoding for ymm8-11 by swapping with ymm2 (multiplication is communative)
if (i < 4 || many_regs)
vmulps(ymm0, Ymm(4 + i), ymm2);
else
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
movaps(_rip_local(d[i].z), ymm0);
}
if (m_en.f)
{
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
if (i < 4 || many_regs)
vmulps(ymm0, Ymm(4 + i), ymm1);
else
vmulps(ymm0, ymm1, ptr[g_const->m_shift_256b[i + 1]]);
cvttps2dq(ymm0, ymm0);
pshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(_rip_local(d[i].f), ymm0);
}
}
}
else
{
// GSVector4 p = vertex[index[1]].p;
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + sizeof(uint32) * 1]);
shl(eax, 6); // * sizeof(GSVertexSW)
if (is64)
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
if (m_en.f)
{
// m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();
movaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
cvttps2dq(xmm0, xmm0);
pextrd(_rip_local(p.f), xmm0, 3);
}
if (m_en.z)
{
// m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
mov(t1.cvt32(), ptr[rax + offsetof(GSVertexSW, t.w)]);
mov(_rip_local(p.z), t1.cvt32());
}
}
}
/// Emits the texture-coordinate delta setup (s, t and, unless fst is set, q).
/// With fst the deltas are converted to integers before being stored.
void GSSetupPrimCodeGenerator2::Texture()
{
	if (!m_en.t)
		return;

	const bool fixed_point = m_sel.fst;
	const int components = fixed_point ? 2 : 3;
	const int steps = m_sel.notest ? 1 : dsize;

	// GSVector4 t = dscan.t;

	broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, t)]);

	THREEARG(mulps, xmm1, xmm0, xmm3);

	if (fixed_point)
	{
		// m_local.d4.stq = GSVector4i(t * 4.0f);

		cvttps2dq(xmm1, xmm1);
		movdqa(_rip_local_d(stq), xmm1);
	}
	else
	{
		// m_local.d4.stq = t * 4.0f;

		movaps(_rip_local_d(stq), xmm1);
	}

	for (int j = 0; j < components; j++)
	{
		// GSVector4 ds = t.xxxx();
		// GSVector4 dt = t.yyyy();
		// GSVector4 dq = t.zzzz();

		THREEARG(shufps, xym1, xym0, xym0, _MM_SHUFFLE(j, j, j, j));

		for (int n = 0; n < steps; n++)
		{
			// GSVector4 v = ds/dt * m_shift[n];

			if (n < 4 || many_regs)
			{
				THREEARG(mulps, xym2, XYm(4 + n), xym1);
			}
			else
			{
				vmulps(ymm2, ymm1, ptr[g_const->m_shift_256b[n + 1]]);
			}

			if (fixed_point)
			{
				// m_local.d[n].s/t = GSVector4i(v);

				cvttps2dq(xym2, xym2);

				if (j == 0)
					movdqa(_rip_local(d[n].s), xym2);
				else if (j == 1)
					movdqa(_rip_local(d[n].t), xym2);
			}
			else
			{
				// m_local.d[n].s/t/q = v;

				if (j == 0)
					movaps(_rip_local(d[n].s), xym2);
				else if (j == 1)
					movaps(_rip_local(d[n].t), xym2);
				else if (j == 2)
					movaps(_rip_local(d[n].q), xym2);
			}
		}
	}
}
void GSSetupPrimCodeGenerator2::Color()
{
if (!m_en.c)
{
return;
}
if (m_sel.iip)
{
// GSVector4 c = dscan.c;
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, c)]);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
THREEARG(mulps, xmm1, xmm0, xmm3);
cvttps2dq(xmm1, xmm1);
pshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
packssdw(xmm1, xmm1);
if (isXmm)
movdqa(_rip_local_d(c), xmm1);
else
movq(_rip_local_d(c), xmm1);
// xym3 is not needed anymore
// GSVector4 dr = c.xxxx();
// GSVector4 db = c.zzzz();
THREEARG(shufps, xym2, xym0, xym0, _MM_SHUFFLE(0, 0, 0, 0));
THREEARG(shufps, xym3, xym0, xym0, _MM_SHUFFLE(2, 2, 2, 2));
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
if (i < 4 || many_regs)
THREEARG(mulps, xym0, XYm(4 + i), xym2);
else
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
cvttps2dq(xym0, xym0);
packssdw(xym0, xym0);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
if (i < 4 || many_regs)
THREEARG(mulps, xym1, XYm(4 + i), xym3);
else
vmulps(ymm1, ymm3, ptr[g_const->m_shift_256b[i + 1]]);
cvttps2dq(xym1, xym1);
packssdw(xym1, xym1);
// m_local.d[i].rb = r.upl16(b);
punpcklwd(xym0, xym1);
movdqa(_rip_local(d[i].rb), xym0);
}
// GSVector4 c = dscan.c;
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
THREEARG(shufps, xym2, xym0, xym0, _MM_SHUFFLE(1, 1, 1, 1));
THREEARG(shufps, xym3, xym0, xym0, _MM_SHUFFLE(3, 3, 3, 3));
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
if (i < 4 || many_regs)
THREEARG(mulps, xym0, XYm(4 + i), xym2);
else
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
cvttps2dq(xym0, xym0);
packssdw(xym0, xym0);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
if (i < 4 || many_regs)
THREEARG(mulps, xym1, XYm(4 + i), xym3);
else
vmulps(ymm1, ymm3, ptr[g_const->m_shift_256b[i + 1]]);
cvttps2dq(xym1, xym1);
packssdw(xym1, xym1);
// m_local.d[i].ga = g.upl16(a);
punpcklwd(xym0, xym1);
movdqa(_rip_local(d[i].ga), xym0);
}
}
else
{
// GSVector4i c = GSVector4i(vertex[index[last].c);
int last = 0;
switch (m_sel.prim)
{
case GS_POINT_CLASS: last = 0; break;
case GS_LINE_CLASS: last = 1; break;
case GS_TRIANGLE_CLASS: last = 2; break;
case GS_SPRITE_CLASS: last = 1; break;
}
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
{
if (is32)
mov(_index, ptr[rsp + _32_index]);
mov(eax, ptr[_index + sizeof(uint32) * last]);
shl(eax, 6); // * sizeof(GSVertexSW)
if (is64)
add(rax, _64_vertex);
else
add(rax, ptr[rsp + _32_vertex]);
}
if (isXmm)
{
cvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]);
}
else
{
vbroadcasti128(ymm0, ptr[rax + offsetof(GSVertexSW, c)]);
cvttps2dq(ymm0, ymm0);
}
// c = c.upl16(c.zwxy());
pshufd(xym1, xym0, _MM_SHUFFLE(1, 0, 3, 2));
punpcklwd(xym0, xym1);
// if(!tme) c = c.srl16(7);
if (m_sel.tfx == TFX_NONE)
{
psrlw(xym0, 7);
}
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
pshufd(xym1, xym0, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xym2, xym0, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(_rip_local(c.rb), xym1);
movdqa(_rip_local(c.ga), xym2);
}
}

View File

@@ -0,0 +1,83 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "GSScanlineEnvironment.h"
#include "GSNewCodeGenerator.h"
// Compile-time selection of the vector width used by the SetupPrim generator.
// With _M_SSE >= 0x501 the generator works on Xbyak::Ymm registers, otherwise
// on Xbyak::Xmm. SETUP_PRIM_USING_XMM / SETUP_PRIM_USING_YMM are the matching
// 0/1 flags for use in #if (exactly one of them is 1).
#if _M_SSE >= 0x501
#define SETUP_PRIM_VECTOR_REGISTER Xbyak::Ymm
#define SETUP_PRIM_USING_XMM 0
#define SETUP_PRIM_USING_YMM 1
#else
#define SETUP_PRIM_VECTOR_REGISTER Xbyak::Xmm
#define SETUP_PRIM_USING_XMM 1
#define SETUP_PRIM_USING_YMM 0
#endif
class GSSetupPrimCodeGenerator2 : public GSNewCodeGenerator
{
using _parent = GSNewCodeGenerator;
using XYm = SETUP_PRIM_VECTOR_REGISTER;
using Xmm = Xbyak::Xmm;
using Ymm = Xbyak::Ymm;
/// On x86-64 we reserve a bunch of GPRs for holding addresses of locals that would otherwise be hard to reach
/// On x86-32 the same values are just raw 32-bit addresses
using LocalAddr = Choose3264<size_t, AddressReg>::type;
constexpr static bool isXmm = std::is_same<XYm, Xbyak::Xmm>::value;
constexpr static bool isYmm = std::is_same<XYm, Xbyak::Ymm>::value;
constexpr static int vecsize = isXmm ? 16 : 32;
constexpr static int dsize = isXmm ? 4 : 8;
constexpr static int _32_args = 0;
constexpr static int _invalid = 0xaaaaaaaa;
constexpr static int _32_vertex = is64 ? _invalid : _32_args + 4;
constexpr static int _32_index = is64 ? _invalid : _32_args + 8;
constexpr static int _32_dscan = is64 ? _invalid : _32_args + 12;
GSScanlineSelector m_sel;
GSScanlineLocalData& m_local;
bool m_rip;
bool many_regs;
struct {uint32 z:1, f:1, t:1, c:1;} m_en;
const XYm xym0{0}, xym1{1}, xym2{2}, xym3{3}, xym4{4}, xym5{5}, xym6{6}, xym7{7}, xym8{8}, xym9{9}, xym10{10}, xym11{11}, xym12{12}, xym13{13}, xym14{14}, xym15{15};
const AddressReg _64_vertex, _index, _dscan, _64_t0, t1;
const LocalAddr _m_local;
/// Returns the first arg on 32-bit, second on 64-bit
static LocalAddr chooseLocal(const void* addr32, AddressReg reg64)
{
return choose3264((size_t)addr32, reg64);
}
public:
GSSetupPrimCodeGenerator2(Xbyak::CodeGenerator* base, CPUInfo cpu, void* param, uint64 key);
void Generate();
private:
/// Broadcast 128 bits of floats from memory to the whole register, whatever size that register might be
void broadcastf128(const XYm& reg, const Xbyak::Address& mem);
void Depth_XMM();
void Depth_YMM();
void Texture();
void Color();
};

View File

@@ -15,6 +15,7 @@
#include "PrecompiledHeader.h"
#include "GSSetupPrimCodeGenerator.h"
#include "GSSetupPrimCodeGenerator.all.h"
using namespace Xbyak;
@@ -30,19 +31,5 @@ GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void
m_en.t = m_sel.fb && m_sel.tfx != TFX_NONE ? 1 : 0;
m_en.c = m_sel.fb && !(m_sel.tfx == TFX_DECAL && m_sel.tcc) ? 1 : 0;
try
{
#if _M_SSE >= 0x501
Generate_AVX2();
#else
if (m_cpu.has(util::Cpu::tAVX))
Generate_AVX();
else
Generate_SSE();
#endif
}
catch (std::exception& e)
{
fprintf(stderr, "ERR:GSSetupPrimCodeGenerator %s\n", e.what());
}
GSSetupPrimCodeGenerator2(this, CPUInfo(m_cpu), param, key).Generate();
}

View File

@@ -32,23 +32,6 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator
uint32 z : 1, f : 1, t : 1, c : 1;
} m_en;
#if _M_SSE < 0x501
void Generate_SSE();
void Depth_SSE();
void Texture_SSE();
void Color_SSE();
void Generate_AVX();
void Depth_AVX();
void Texture_AVX();
void Color_AVX();
#else
void Generate_AVX2();
void Depth_AVX2();
void Texture_AVX2();
void Color_AVX2();
#endif
public:
GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);
};

View File

@@ -1,365 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
#include "GS/GS_codegen.h"
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
// Memory operand for `m_local.field`: formed from the field's own address when
// m_rip is set, otherwise t0 plus the field's offset in GSScanlineLocalData.
#define _rip_local(field) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offsetof(GSScanlineLocalData, field)])
// Variant for fields whose offset cannot be written with offsetof (for example
// d[i].f with a run-time i): the caller passes the offset used on the t0 path.
#define _rip_local_v(field, offset) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offset])
void GSSetupPrimCodeGenerator::Generate_AVX()
{
// Technically we just need the delta < 2GB
m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000;
#ifdef _WIN64
sub(rsp, 8 + 2 * 16);
vmovdqa(ptr[rsp + 0], xmm6);
vmovdqa(ptr[rsp + 16], xmm7);
#endif
if (!m_rip)
mov(t0, (size_t)&m_local);
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{
mov(rax, (size_t)g_const->m_shift_128b);
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{
vmovaps(Xmm(3 + i), ptr[rax + i * 16]);
}
}
Depth_AVX();
Texture_AVX();
Color_AVX();
#ifdef _WIN64
vmovdqa(xmm6, ptr[rsp + 0]);
vmovdqa(xmm7, ptr[rsp + 16]);
add(rsp, 8 + 2 * 16);
#endif
ret();
}
void GSSetupPrimCodeGenerator::Depth_AVX()
{
if (!m_en.z && !m_en.f)
{
return;
}
if (m_sel.prim != GS_SPRITE_CLASS)
{
// GSVector4 p = dscan.p;
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, p)]);
if (m_en.f)
{
// GSVector4 df = p.wwww();
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
vmulps(xmm2, xmm1, xmm3);
vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(_rip_local(d4.f), xmm2);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
vmulps(xmm2, xmm1, Xmm(4 + i));
vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
vmovdqa(_rip_local_v(d[i].f, variableOffset), xmm2);
}
}
if (m_en.z)
{
// GSVector4 dz = p.zzzz();
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_local.d4.z = dz * 4.0f;
vmulps(xmm1, xmm0, xmm3);
vmovdqa(_rip_local(d4.z), xmm1);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].z = dz * m_shift[i];
vmulps(xmm1, xmm0, Xmm(4 + i));
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
vmovdqa(_rip_local_v(d[i].z, variableOffset), xmm1);
}
}
}
else
{
// GSVector4 p = vertex[index[1]].p;
mov(eax, ptr[a1 + sizeof(uint32) * 1]);
shl(eax, 6); // * sizeof(GSVertexSW)
add(rax, a0);
if (m_en.f)
{
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
vmovaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
vcvttps2dq(xmm1, xmm0);
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(_rip_local(p.f), xmm1);
}
if (m_en.z)
{
// uint32 z is bypassed in t.w
vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
vmovdqa(_rip_local(p.z), xmm0);
}
}
}
/// Emits the texture-coordinate delta setup (AVX, 128-bit): s, t and, unless fst is set, q.
/// With fst the deltas are converted to integers before being stored.
void GSSetupPrimCodeGenerator::Texture_AVX()
{
	if (!m_en.t)
		return;

	const bool fixed_point = m_sel.fst;
	const int components = fixed_point ? 2 : 3;
	const int steps = m_sel.notest ? 1 : 4;
	// Byte distance between consecutive m_local.d[] elements
	const size_t stride = sizeof(GSScanlineLocalData::d[0]);

	// GSVector4 t = dscan.t;

	vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, t)]);

	vmulps(xmm1, xmm0, xmm3);

	if (fixed_point)
	{
		// m_local.d4.stq = GSVector4i(t * 4.0f);

		vcvttps2dq(xmm1, xmm1);
		vmovdqa(_rip_local(d4.stq), xmm1);
	}
	else
	{
		// m_local.d4.stq = t * 4.0f;

		vmovaps(_rip_local(d4.stq), xmm1);
	}

	for (int j = 0; j < components; j++)
	{
		// GSVector4 ds = t.xxxx();
		// GSVector4 dt = t.yyyy();
		// GSVector4 dq = t.zzzz();

		vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));

		for (int i = 0; i < steps; i++)
		{
			// GSVector4 v = ds/dt * m_shift[i];

			vmulps(xmm2, xmm1, Xmm(4 + i));

			// Offsets of this step's s/t/q slots, used on the non-rip addressing path
			const size_t s_offset = offsetof(GSScanlineLocalData, d[0].s) + (i * stride);
			const size_t t_offset = offsetof(GSScanlineLocalData, d[0].t) + (i * stride);
			const size_t q_offset = offsetof(GSScanlineLocalData, d[0].q) + (i * stride);

			if (fixed_point)
			{
				// m_local.d[i].s/t = GSVector4i(v);

				vcvttps2dq(xmm2, xmm2);

				if (j == 0)
					vmovdqa(_rip_local_v(d[i].s, s_offset), xmm2);
				else if (j == 1)
					vmovdqa(_rip_local_v(d[i].t, t_offset), xmm2);
			}
			else
			{
				// m_local.d[i].s/t/q = v;

				if (j == 0)
					vmovaps(_rip_local_v(d[i].s, s_offset), xmm2);
				else if (j == 1)
					vmovaps(_rip_local_v(d[i].t, t_offset), xmm2);
				else if (j == 2)
					vmovaps(_rip_local_v(d[i].q, q_offset), xmm2);
			}
		}
	}
}
void GSSetupPrimCodeGenerator::Color_AVX()
{
if (!m_en.c)
{
return;
}
if (m_sel.iip)
{
// GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
vmulps(xmm1, xmm0, xmm3);
vcvttps2dq(xmm1, xmm1);
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
vpackssdw(xmm1, xmm1);
vmovdqa(_rip_local(d4.c), xmm1);
// xmm3 is not needed anymore
// GSVector4 dr = c.xxxx();
// GSVector4 db = c.zzzz();
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
vmulps(xmm0, xmm2, Xmm(4 + i));
vcvttps2dq(xmm0, xmm0);
vpackssdw(xmm0, xmm0);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
vmulps(xmm1, xmm3, Xmm(4 + i));
vcvttps2dq(xmm1, xmm1);
vpackssdw(xmm1, xmm1);
// m_local.d[i].rb = r.upl16(b);
vpunpcklwd(xmm0, xmm1);
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
vmovdqa(_rip_local_v(d[i].rb, variableOffset), xmm0);
}
// GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
vmulps(xmm0, xmm2, Xmm(4 + i));
vcvttps2dq(xmm0, xmm0);
vpackssdw(xmm0, xmm0);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
vmulps(xmm1, xmm3, Xmm(4 + i));
vcvttps2dq(xmm1, xmm1);
vpackssdw(xmm1, xmm1);
// m_local.d[i].ga = g.upl16(a);
vpunpcklwd(xmm0, xmm1);
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
vmovdqa(_rip_local_v(d[i].ga, variableOffset), xmm0);
}
}
else
{
// GSVector4i c = GSVector4i(vertex[index[last].c);
int last = 0;
switch (m_sel.prim)
{
case GS_POINT_CLASS: last = 0; break;
case GS_LINE_CLASS: last = 1; break;
case GS_TRIANGLE_CLASS: last = 2; break;
case GS_SPRITE_CLASS: last = 1; break;
}
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
{
mov(eax, ptr[a1 + sizeof(uint32) * last]);
shl(eax, 6); // * sizeof(GSVertexSW)
add(rax, a0);
}
vcvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]);
// c = c.upl16(c.zwxy());
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
vpunpcklwd(xmm0, xmm1);
// if(!tme) c = c.srl16(7);
if (m_sel.tfx == TFX_NONE)
{
vpsrlw(xmm0, 7);
}
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(_rip_local(c.rb), xmm1);
vmovdqa(_rip_local(c.ga), xmm2);
}
}
#endif

View File

@@ -1,368 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
#include "GS/GS_codegen.h"
#if _M_SSE >= 0x501 && (defined(_M_AMD64) || defined(_WIN64))
// Memory operand for `m_local.field`: formed from the field's own address when
// m_rip is set, otherwise t0 plus the field's offset in GSScanlineLocalData.
#define _rip_local(field) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offsetof(GSScanlineLocalData, field)])
// Variant for fields whose offset cannot be written with offsetof (for example
// d[i].z with a run-time i): the caller passes the offset used on the t0 path.
#define _rip_local_v(field, offset) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offset])
// Register holding m_shift[i]: the table is kept in ymm7 (i = 0) upwards
#define _m_shift(i) (Ymm(7 + i))
// FIXME windows ?
// NOTE(review): hard-codes rcx as the vertex register; the FIXME above suggests
// this was never checked against the Windows argument registers -- confirm before use.
#define _vertex rcx
void GSSetupPrimCodeGenerator::Generate_AVX2()
{
// Technically we just need the delta < 2GB
m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000;
#ifdef _WIN64
sub(rsp, 8 + 2 * 16);
vmovdqa(ptr[rsp + 0], ymm6);
vmovdqa(ptr[rsp + 16], ymm7);
#endif
if (!m_rip)
mov(t0, (size_t)&m_local);
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{
mov(rax, (size_t)g_const->m_shift_256b);
for (int i = 0; i < (m_sel.notest ? 2 : 9); i++)
{
vmovaps(_m_shift(i), ptr[rax + i * 32]);
}
}
// ymm7 to ymm 15 = m_shift[i]
Depth_AVX2();
Texture_AVX2();
Color_AVX2();
#ifdef _WIN64
vmovdqa(ymm6, ptr[rsp + 0]);
vmovdqa(ymm7, ptr[rsp + 16]);
add(rsp, 8 + 2 * 16);
#endif
ret();
}
// Emits the depth (z) and fog (f) setup of the x64 AVX2 SetupPrim routine.
//
// Non-sprite primitives: reads dscan.p through a2, writes the shift[0]-scaled step to
// m_local.d8.p.z / m_local.d8.p.f and the per-entry deltas to m_local.d[i].z / m_local.d[i].f.
// Sprites: loads the address of vertex[index[1]] into _vertex and copies its fog and z values
// to m_local.p.f / m_local.p.z. Color_AVX2 reuses _vertex afterwards, so it must stay intact.
void GSSetupPrimCodeGenerator::Depth_AVX2()
{
	if (!m_en.z && !m_en.f)
	{
		return;
	}

	if (m_sel.prim != GS_SPRITE_CLASS)
	{
		const Ymm& dscan_p = ymm6;

		// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
		vbroadcastf128(dscan_p, ptr[a2 + offsetof(GSVertexSW, p)]);
		vmulps(ymm1, dscan_p, _m_shift(0));

		if (m_en.z)
		{
			// m_local.d8.p.z = dp8.extract32<2>();
			vextractps(_rip_local(d8.p.z), xmm1, 2);

			// GSVector8 dz = GSVector8(dscan.p).zzzz();
			vshufps(ymm2, dscan_p, dscan_p, _MM_SHUFFLE(2, 2, 2, 2));

			for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
			{
				// m_local.d[i].z = dz * shift[1 + i];
				vmulps(ymm0, ymm2, _m_shift(1 + i));
				const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
				vmovaps(_rip_local_v(d[i].z, variableOffset), ymm0);
			}
		}

		if (m_en.f)
		{
			// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
			// Truncating conversion, like every other float -> int conversion in this file.
			vcvttps2dq(ymm2, ymm1);
			vpextrd(_rip_local(d8.p.f), xmm2, 3);

			// GSVector8 df = GSVector8(dscan.p).wwww();
			vshufps(ymm3, dscan_p, dscan_p, _MM_SHUFFLE(3, 3, 3, 3));

			for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
			{
				// m_local.d[i].f = GSVector8i(df * shift[1 + i]).xxzzlh();
				vmulps(ymm0, ymm3, _m_shift(1 + i));
				vcvttps2dq(ymm0, ymm0);
				vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
				vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
				const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
				vmovdqa(_rip_local_v(d[i].f, variableOffset), ymm0);
			}
		}
	}
	else
	{
		// GSVector4 p = vertex[index[1]].p;
		mov(_vertex.cvt32(), ptr[a1 + sizeof(uint32) * 1]);
		shl(_vertex.cvt32(), 6); // * sizeof(GSVertexSW)
		add(_vertex, a0);

		if (m_en.f)
		{
			// m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();
			vmovaps(xmm0, ptr[_vertex + offsetof(GSVertexSW, p)]);
			vcvttps2dq(xmm0, xmm0);
			vpextrd(_rip_local(p.f), xmm0, 3);
		}

		if (m_en.z)
		{
			// m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
			// Address through the full 64-bit _vertex register: a 32-bit base register (ecx)
			// would truncate the pointer computed above.
			mov(eax, ptr[_vertex + offsetof(GSVertexSW, t.w)]);
			mov(_rip_local(p.z), eax);
		}
	}
}
// Emits the texture-coordinate setup of the x64 AVX2 SetupPrim routine.
// Reads dscan.t through a2, stores dscan.t * shift[0] to m_local.d8.stq and the per-entry
// deltas of s, t (and q when m_sel.fst is clear) to m_local.d[i].
// With m_sel.fst set, the products are converted to integers (truncating) before being stored.
void GSSetupPrimCodeGenerator::Texture_AVX2()
{
if (!m_en.t)
{
return;
}
// GSVector8 dt(dscan.t);
vbroadcastf128(ymm0, ptr[a2 + offsetof(GSVertexSW, t)]);
// GSVector8 dt8 = dt * shift[0];
vmulps(ymm1, ymm0, _m_shift(0));
if (m_sel.fst)
{
// m_local.d8.stq = GSVector4i(dt8);
vcvttps2dq(ymm1, ymm1);
vmovdqa(_rip_local(d8.stq), xmm1);
}
else
{
// m_local.d8.stq = dt8;
vmovaps(_rip_local(d8.stq), xmm1);
}
// j selects the component: 0 = s, 1 = t, 2 = q (q is skipped when fst is set)
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{
// GSVector8 dstq = dt.xxxx/yyyy/zzzz();
vshufps(ymm1, ymm0, ymm0, (uint8)_MM_SHUFFLE(j, j, j, j));
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
{
// GSVector8 v = dstq * shift[1 + i];
vmulps(ymm2, ymm1, _m_shift(1 + i));
if (m_sel.fst)
{
// m_local.d[i].s/t = GSVector8::cast(GSVector8i(v));
vcvttps2dq(ymm2, ymm2);
// Byte offsets of d[i].s / d[i].t, used when m_local is addressed through t0
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
switch (j)
{
case 0: vmovdqa(_rip_local_v(d[i].s, variableOffsetS), ymm2); break;
case 1: vmovdqa(_rip_local_v(d[i].t, variableOffsetT), ymm2); break;
}
}
else
{
// m_local.d[i].s/t/q = v;
// Byte offsets of d[i].s / d[i].t / d[i].q, used when m_local is addressed through t0
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
const size_t variableOffsetQ = offsetof(GSScanlineLocalData, d[0].q) + (i * sizeof(GSScanlineLocalData::d[0]));
switch (j)
{
case 0: vmovaps(_rip_local_v(d[i].s, variableOffsetS), ymm2); break;
case 1: vmovaps(_rip_local_v(d[i].t, variableOffsetT), ymm2); break;
case 2: vmovaps(_rip_local_v(d[i].q, variableOffsetQ), ymm2); break;
}
}
}
}
}
// Emits the colour setup of the x64 AVX2 SetupPrim routine.
//
// m_sel.iip set (interpolated colour): reads dscan.c through a2, stores the packed
// shift[0]-scaled step to m_local.d8.c and the per-entry red/blue and green/alpha deltas to
// m_local.d[i].rb / m_local.d[i].ga.
// m_sel.iip clear (flat colour): converts the colour of the primitive's last vertex and
// broadcasts it to m_local.c.rb / m_local.c.ga.
void GSSetupPrimCodeGenerator::Color_AVX2()
{
	if (!m_en.c)
	{
		return;
	}

	if (m_sel.iip)
	{
		const Ymm& dscan_c = ymm6;

		// GSVector8 dc(dscan.c);
		vbroadcastf128(dscan_c, ptr[a2 + offsetof(GSVertexSW, c)]);

		// m_local.d8.c = GSVector8i(dc * shift[0]).xzyw().ps32();
		// shift[0] lives in _m_shift(0) in this generator. ymm3 is never loaded with it and
		// only holds whatever Depth_AVX2 left behind.
		vmulps(ymm1, dscan_c, _m_shift(0));
		vcvttps2dq(ymm1, ymm1);
		vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0));
		vpackssdw(ymm1, ymm1);
		vmovq(_rip_local(d8.c), xmm1);

		// GSVector8 dr = dc.xxxx();
		// GSVector8 db = dc.zzzz();
		vshufps(ymm2, dscan_c, dscan_c, _MM_SHUFFLE(0, 0, 0, 0));
		vshufps(ymm3, dscan_c, dscan_c, _MM_SHUFFLE(2, 2, 2, 2));

		for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
		{
			// GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32();
			vmulps(ymm0, ymm2, _m_shift(1 + i));
			vcvttps2dq(ymm0, ymm0);
			vpackssdw(ymm0, ymm0);

			// GSVector8i b = GSVector8i(db * shift[1 + i]).ps32();
			vmulps(ymm1, ymm3, _m_shift(1 + i));
			vcvttps2dq(ymm1, ymm1);
			vpackssdw(ymm1, ymm1);

			// m_local.d[i].rb = r.upl16(b);
			vpunpcklwd(ymm0, ymm1);
			const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
			vmovdqa(_rip_local_v(d[i].rb, variableOffset), ymm0);
		}

		// GSVector8 dg = dc.yyyy();
		// GSVector8 da = dc.wwww();
		vshufps(ymm2, dscan_c, dscan_c, _MM_SHUFFLE(1, 1, 1, 1));
		vshufps(ymm3, dscan_c, dscan_c, _MM_SHUFFLE(3, 3, 3, 3));

		for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
		{
			// GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32();
			vmulps(ymm0, ymm2, _m_shift(1 + i));
			vcvttps2dq(ymm0, ymm0);
			vpackssdw(ymm0, ymm0);

			// GSVector8i a = GSVector8i(da * shift[1 + i]).ps32();
			vmulps(ymm1, ymm3, _m_shift(1 + i));
			vcvttps2dq(ymm1, ymm1);
			vpackssdw(ymm1, ymm1);

			// m_local.d[i].ga = g.upl16(a);
			vpunpcklwd(ymm0, ymm1);
			const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
			vmovdqa(_rip_local_v(d[i].ga, variableOffset), ymm0);
		}
	}
	else
	{
		// GSVector8i c = GSVector8i(GSVector8(vertex[index[last]].c));
		int last = 0;

		switch (m_sel.prim)
		{
			case GS_POINT_CLASS: last = 0; break;
			case GS_LINE_CLASS: last = 1; break;
			case GS_TRIANGLE_CLASS: last = 2; break;
			case GS_SPRITE_CLASS: last = 1; break;
		}

		if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
		{
			mov(_vertex.cvt32(), ptr[a1 + sizeof(uint32) * last]);
			shl(_vertex.cvt32(), 6); // * sizeof(GSVertexSW)
			add(_vertex, a0);
		}

		vbroadcasti128(ymm0, ptr[_vertex + offsetof(GSVertexSW, c)]);
		vcvttps2dq(ymm0, ymm0);

		// c = c.upl16(c.zwxy());
		vpshufd(ymm1, ymm0, _MM_SHUFFLE(1, 0, 3, 2));
		vpunpcklwd(ymm0, ymm1);

		// if(!tme) c = c.srl16(7);
		if (m_sel.tfx == TFX_NONE)
		{
			vpsrlw(ymm0, 7);
		}

		// m_local.c.rb = c.xxxx();
		// m_local.c.ga = c.zzzz();
		vpshufd(ymm1, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
		vpshufd(ymm2, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
		vmovdqa(_rip_local(c.rb), ymm1);
		vmovdqa(_rip_local(c.ga), ymm2);
	}
}
#endif

View File

@@ -1,374 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
#include "GS/GS_codegen.h"
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
// Emits the complete x64 SSE SetupPrim routine: Win64 save of xmm6/xmm7, t0 = &m_local,
// the shift table in xmm3.. when an interpolated path needs it, then the depth, texture and
// colour sections, the register restore and the return.
void GSSetupPrimCodeGenerator::Generate_SSE()
{
#ifdef _WIN64
	// The shift table below can reach xmm6/xmm7, which are callee-saved on Win64.
	// Use the legacy-encoded movdqa: this is the SSE code path, so VEX-encoded (AVX)
	// instructions must not be emitted here.
	sub(rsp, 8 + 2 * 16);
	movdqa(ptr[rsp + 0], xmm6);
	movdqa(ptr[rsp + 16], xmm7);
#endif

	mov(t0, (size_t)&m_local);

	if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
	{
		mov(rax, (size_t)g_const->m_shift_128b[0]);

		// xmm3 = shift[0], xmm4.. = shift[1..]
		for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
		{
			movaps(Xmm(3 + i), ptr[rax + i * 16]);
		}
	}

	Depth_SSE();
	Texture_SSE();
	Color_SSE();

#ifdef _WIN64
	movdqa(xmm6, ptr[rsp + 0]);
	movdqa(xmm7, ptr[rsp + 16]);
	add(rsp, 8 + 2 * 16);
#endif

	ret();
}
void GSSetupPrimCodeGenerator::Depth_SSE()
{
if (!m_en.z && !m_en.f)
{
return;
}
if (m_sel.prim != GS_SPRITE_CLASS)
{
// GSVector4 p = dscan.p;
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, p)]);
if (m_en.f)
{
// GSVector4 df = p.wwww();
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
movaps(xmm2, xmm1);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
movaps(xmm2, xmm1);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
movdqa(ptr[t0 + variableOffset], xmm2);
}
}
if (m_en.z)
{
// GSVector4 dz = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_local.d4.z = dz * 4.0f;
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].z = dz * m_shift[i];
movaps(xmm1, xmm0);
mulps(xmm1, Xmm(4 + i));
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
movdqa(ptr[t0 + variableOffset], xmm1);
}
}
}
else
{
// GSVector4 p = vertex[index[1]].p;
mov(eax, ptr[a1 + sizeof(uint32) * 1]);
shl(eax, 6); // * sizeof(GSVertexSW)
add(rax, a0);
movaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
if (m_en.f)
{
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
cvttps2dq(xmm1, xmm0);
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.f)], xmm1);
}
if (m_en.z)
{
// uint32 z is bypassed in t.w
vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.z)], xmm0);
}
}
}
// Emits the texture-coordinate setup of the x64 SSE SetupPrim routine.
// Reads dscan.t through a2, stores dscan.t * xmm3 (shift[0], loaded by Generate_SSE) to
// m_local.d4.stq and the per-entry deltas of s, t (and q when m_sel.fst is clear) to
// m_local.d[i]. With m_sel.fst set, the products are converted to integers (truncating).
void GSSetupPrimCodeGenerator::Texture_SSE()
{
if (!m_en.t)
{
return;
}
// GSVector4 t = dscan.t;
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, t)]);
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
if (m_sel.fst)
{
// m_local.d4.stq = GSVector4i(t * 4.0f);
cvttps2dq(xmm1, xmm1);
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
}
else
{
// m_local.d4.stq = t * 4.0f;
movaps(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
}
// j selects the component: 0 = s, 1 = t, 2 = q (q is skipped when fst is set)
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
// GSVector4 dq = t.zzzz();
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4 v = ds/dt * m_shift[i];
movaps(xmm2, xmm1);
mulps(xmm2, Xmm(4 + i));
if (m_sel.fst)
{
// m_local.d[i].s/t = GSVector4i(v);
cvttps2dq(xmm2, xmm2);
// Byte offsets of d[i].s / d[i].t relative to t0 (&m_local)
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
switch (j)
{
case 0: movdqa(ptr[t0 + variableOffsetS], xmm2); break;
case 1: movdqa(ptr[t0 + variableOffsetT], xmm2); break;
}
}
else
{
// m_local.d[i].s/t/q = v;
// Byte offsets of d[i].s / d[i].t / d[i].q relative to t0 (&m_local)
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
const size_t variableOffsetQ = offsetof(GSScanlineLocalData, d[0].q) + (i * sizeof(GSScanlineLocalData::d[0]));
switch (j)
{
case 0: movaps(ptr[t0 + variableOffsetS], xmm2); break;
case 1: movaps(ptr[t0 + variableOffsetT], xmm2); break;
case 2: movaps(ptr[t0 + variableOffsetQ], xmm2); break;
}
}
}
}
}
// Emits the colour setup of the x64 SSE SetupPrim routine.
// m_sel.iip set: reads dscan.c through a2 and writes m_local.d4.c plus the per-entry
// red/blue and green/alpha deltas m_local.d[i].rb / m_local.d[i].ga.
// m_sel.iip clear: converts the colour of the primitive's last vertex and broadcasts it to
// m_local.c.rb / m_local.c.ga.
void GSSetupPrimCodeGenerator::Color_SSE()
{
if (!m_en.c)
{
return;
}
if (m_sel.iip)
{
// GSVector4 c = dscan.c;
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]);
movaps(xmm1, xmm0);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
movaps(xmm2, xmm0);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
packssdw(xmm2, xmm2);
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.c)], xmm2);
// xmm3 is not needed anymore (it is reused as a temporary in the loops below)
// GSVector4 dr = c.xxxx();
// GSVector4 db = c.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
movaps(xmm2, xmm0);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
packssdw(xmm2, xmm2);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
movaps(xmm3, xmm1);
mulps(xmm3, Xmm(4 + i));
cvttps2dq(xmm3, xmm3);
packssdw(xmm3, xmm3);
// m_local.d[i].rb = r.upl16(b);
punpcklwd(xmm2, xmm3);
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
movdqa(ptr[t0 + variableOffset], xmm2);
}
// GSVector4 c = dscan.c;
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
movaps(xmm1, xmm0);
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
movaps(xmm2, xmm0);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
packssdw(xmm2, xmm2);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
movaps(xmm3, xmm1);
mulps(xmm3, Xmm(4 + i));
cvttps2dq(xmm3, xmm3);
packssdw(xmm3, xmm3);
// m_local.d[i].ga = g.upl16(a);
punpcklwd(xmm2, xmm3);
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
movdqa(ptr[t0 + variableOffset], xmm2);
}
}
else
{
// GSVector4i c = GSVector4i(vertex[index[last]].c);
// last = index slot of the vertex whose colour is used for flat shading
int last = 0;
switch (m_sel.prim)
{
case GS_POINT_CLASS: last = 0; break;
case GS_LINE_CLASS: last = 1; break;
case GS_TRIANGLE_CLASS: last = 2; break;
case GS_SPRITE_CLASS: last = 1; break;
}
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
{
mov(eax, ptr[a1 + sizeof(uint32) * last]);
shl(eax, 6); // * sizeof(GSVertexSW)
add(rax, a0);
}
cvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]);
// c = c.upl16(c.zwxy());
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
punpcklwd(xmm0, xmm1);
// if(!tme) c = c.srl16(7);
if (m_sel.tfx == TFX_NONE)
{
psrlw(xmm0, 7);
}
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
}
}
#endif

View File

@@ -1,335 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
#include "GS/GS_codegen.h"
#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
// Byte offsets, relative to esp at entry of the generated routine, of its three pointer
// arguments. NOTE(review): slot 0 is presumably the return address - confirm.
static const int _args = 0;
static const int _vertex = _args + 1 * 4;
static const int _index = _args + 2 * 4;
static const int _dscan = _args + 3 * 4;
// Emits the complete x86 AVX SetupPrim routine: when an interpolated path is enabled it
// loads the dscan argument into edx and the shift table into xmm3.., then emits the depth,
// texture and colour sections followed by the return.
void GSSetupPrimCodeGenerator::Generate_AVX()
{
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{
// edx = dscan, read by the Depth/Texture/Color sections
mov(edx, dword[esp + _dscan]);
// xmm3 = shift[0], xmm4.. = shift[1..]
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{
vmovaps(Xmm(3 + i), ptr[g_const->m_shift_128b[i]]);
}
}
Depth_AVX();
Texture_AVX();
Color_AVX();
ret();
}
// Emits the depth (z) and fog (f) setup of the x86 AVX SetupPrim routine.
// Non-sprite primitives: reads dscan.p through edx and writes m_local.d4.f / m_local.d[i].f
// and m_local.d4.z / m_local.d[i].z.
// Sprites: loads the address of vertex[index[1]] into ecx and writes m_local.p.f /
// m_local.p.z. Color_AVX reuses ecx afterwards.
void GSSetupPrimCodeGenerator::Depth_AVX()
{
if (!m_en.z && !m_en.f)
{
return;
}
if (m_sel.prim != GS_SPRITE_CLASS)
{
// GSVector4 p = dscan.p;
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, p)]);
if (m_en.f)
{
// GSVector4 df = p.wwww();
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
vmulps(xmm2, xmm1, xmm3);
vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_local.d4.f], xmm2);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
vmulps(xmm2, xmm1, Xmm(4 + i));
vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_local.d[i].f], xmm2);
}
}
if (m_en.z)
{
// GSVector4 dz = p.zzzz();
// xmm0 (dscan.p) is overwritten here; the fog block above has already consumed it
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_local.d4.z = dz * 4.0f;
vmulps(xmm1, xmm0, xmm3);
vmovdqa(ptr[&m_local.d4.z], xmm1);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].z = dz * m_shift[i];
vmulps(xmm1, xmm0, Xmm(4 + i));
vmovdqa(ptr[&m_local.d[i].z], xmm1);
}
}
}
else
{
// GSVector4 p = vertex[index[1]].p;
mov(ecx, ptr[esp + _index]);
mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
shl(ecx, 6); // * sizeof(GSVertexSW)
add(ecx, ptr[esp + _vertex]);
vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
if (m_en.f)
{
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
vcvttps2dq(xmm1, xmm0);
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[&m_local.p.f], xmm1);
}
if (m_en.z)
{
// uint32 z is bypassed in t.w
vmovdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]);
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
vmovdqa(ptr[&m_local.p.z], xmm0);
}
}
}
// Emits the texture-coordinate setup of the x86 AVX SetupPrim routine.
// Reads dscan.t through edx, stores dscan.t * xmm3 (shift[0], loaded by Generate_AVX) to
// m_local.d4.stq and the per-entry deltas of s, t (and q when m_sel.fst is clear) to
// m_local.d[i]. With m_sel.fst set, the products are converted to integers (truncating).
void GSSetupPrimCodeGenerator::Texture_AVX()
{
if (!m_en.t)
{
return;
}
// GSVector4 t = dscan.t;
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, t)]);
vmulps(xmm1, xmm0, xmm3);
if (m_sel.fst)
{
// m_local.d4.stq = GSVector4i(t * 4.0f);
vcvttps2dq(xmm1, xmm1);
vmovdqa(ptr[&m_local.d4.stq], xmm1);
}
else
{
// m_local.d4.stq = t * 4.0f;
vmovaps(ptr[&m_local.d4.stq], xmm1);
}
// j selects the component: 0 = s, 1 = t, 2 = q (q is skipped when fst is set)
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
// GSVector4 dq = t.zzzz();
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4 v = ds/dt * m_shift[i];
vmulps(xmm2, xmm1, Xmm(4 + i));
if (m_sel.fst)
{
// m_local.d[i].s/t = GSVector4i(v);
vcvttps2dq(xmm2, xmm2);
switch (j)
{
case 0: vmovdqa(ptr[&m_local.d[i].s], xmm2); break;
case 1: vmovdqa(ptr[&m_local.d[i].t], xmm2); break;
}
}
else
{
// m_local.d[i].s/t/q = v;
switch (j)
{
case 0: vmovaps(ptr[&m_local.d[i].s], xmm2); break;
case 1: vmovaps(ptr[&m_local.d[i].t], xmm2); break;
case 2: vmovaps(ptr[&m_local.d[i].q], xmm2); break;
}
}
}
}
}
// Emits the colour setup of the x86 AVX SetupPrim routine.
// m_sel.iip set: reads dscan.c through edx and writes m_local.d4.c plus the per-entry
// red/blue and green/alpha deltas m_local.d[i].rb / m_local.d[i].ga.
// m_sel.iip clear: converts the colour of the primitive's last vertex and broadcasts it to
// m_local.c.rb / m_local.c.ga.
void GSSetupPrimCodeGenerator::Color_AVX()
{
if (!m_en.c)
{
return;
}
if (m_sel.iip)
{
// GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
vmulps(xmm1, xmm0, xmm3);
vcvttps2dq(xmm1, xmm1);
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
vpackssdw(xmm1, xmm1);
vmovdqa(ptr[&m_local.d4.c], xmm1);
// xmm3 is not needed anymore (it is overwritten with db / da below)
// GSVector4 dr = c.xxxx();
// GSVector4 db = c.zzzz();
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
vmulps(xmm0, xmm2, Xmm(4 + i));
vcvttps2dq(xmm0, xmm0);
vpackssdw(xmm0, xmm0);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
vmulps(xmm1, xmm3, Xmm(4 + i));
vcvttps2dq(xmm1, xmm1);
vpackssdw(xmm1, xmm1);
// m_local.d[i].rb = r.upl16(b);
vpunpcklwd(xmm0, xmm1);
vmovdqa(ptr[&m_local.d[i].rb], xmm0);
}
// GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
vmulps(xmm0, xmm2, Xmm(4 + i));
vcvttps2dq(xmm0, xmm0);
vpackssdw(xmm0, xmm0);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
vmulps(xmm1, xmm3, Xmm(4 + i));
vcvttps2dq(xmm1, xmm1);
vpackssdw(xmm1, xmm1);
// m_local.d[i].ga = g.upl16(a);
vpunpcklwd(xmm0, xmm1);
vmovdqa(ptr[&m_local.d[i].ga], xmm0);
}
}
else
{
// GSVector4i c = GSVector4i(vertex[index[last]].c);
// last = index slot of the vertex whose colour is used for flat shading
int last = 0;
switch (m_sel.prim)
{
case GS_POINT_CLASS: last = 0; break;
case GS_LINE_CLASS: last = 1; break;
case GS_TRIANGLE_CLASS: last = 2; break;
case GS_SPRITE_CLASS: last = 1; break;
}
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
{
mov(ecx, ptr[esp + _index]);
mov(ecx, ptr[ecx + sizeof(uint32) * last]);
shl(ecx, 6); // * sizeof(GSVertexSW)
add(ecx, ptr[esp + _vertex]);
}
vcvttps2dq(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]);
// c = c.upl16(c.zwxy());
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
vpunpcklwd(xmm0, xmm1);
// if(!tme) c = c.srl16(7);
if (m_sel.tfx == TFX_NONE)
{
vpsrlw(xmm0, 7);
}
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[&m_local.c.rb], xmm1);
vmovdqa(ptr[&m_local.c.ga], xmm2);
}
}
#endif

View File

@@ -1,360 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
#include "GS/GS_codegen.h"
#if _M_SSE >= 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
// Byte offsets, relative to esp at entry of the generated routine, of its three pointer
// arguments. NOTE(review): slot 0 is presumably the return address - confirm.
static const int _args = 0;
static const int _vertex = _args + 1 * 4;
static const int _index = _args + 2 * 4;
static const int _dscan = _args + 3 * 4;
// Emits the complete x86 AVX2 SetupPrim routine: when an interpolated path is enabled it
// loads the dscan argument into edx and the first shift table entries into ymm3..ymm7, then
// emits the depth, texture and colour sections followed by the return.
void GSSetupPrimCodeGenerator::Generate_AVX2()
{
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{
// edx = dscan, read by the Depth/Texture/Color sections
mov(edx, dword[esp + _dscan]);
// ymm3 = shift[0], ymm4..ymm7 = shift[1..4]; the sections read the remaining
// entries straight from memory
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{
vmovaps(Ymm(3 + i), ptr[g_const->m_shift_256b[i]]);
}
}
Depth_AVX2();
Texture_AVX2();
Color_AVX2();
ret();
}
// Emits the depth (z) and fog (f) setup of the x86 AVX2 SetupPrim routine.
//
// Non-sprite primitives: reads dscan.p through edx, writes the shift[0]-scaled step to
// m_local.d8.p.z / m_local.d8.p.f and the per-entry deltas to m_local.d[i].z / m_local.d[i].f.
// Shift entries 1..4 are read from ymm4..ymm7, later ones from memory.
// Sprites: loads the address of vertex[index[1]] into ecx and copies its fog and z values to
// m_local.p.f / m_local.p.z. Color_AVX2 reuses ecx afterwards.
void GSSetupPrimCodeGenerator::Depth_AVX2()
{
	if (!m_en.z && !m_en.f)
	{
		return;
	}

	if (m_sel.prim != GS_SPRITE_CLASS)
	{
		// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
		vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, p)]);
		vmulps(ymm1, ymm0, ymm3);

		if (m_en.z)
		{
			// m_local.d8.p.z = dp8.extract32<2>();
			vextractps(ptr[&m_local.d8.p.z], xmm1, 2);
		}

		if (m_en.f)
		{
			// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
			// Truncating conversion, matching the per-entry fog deltas below and every
			// other float -> int conversion in this generator (the rounding vcvtps2dq was
			// the odd one out).
			vcvttps2dq(ymm2, ymm1);
			vpextrd(ptr[&m_local.d8.p.f], xmm2, 3);
		}

		if (m_en.z)
		{
			// GSVector8 dz = GSVector8(dscan.p).zzzz();
			vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
		}

		if (m_en.f)
		{
			// GSVector8 df = GSVector8(dscan.p).wwww();
			vshufps(ymm1, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
		}

		for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
		{
			if (m_en.z)
			{
				// m_local.d[i].z = dz * shift[1 + i];
				if (i < 4)
					vmulps(ymm0, ymm2, Ymm(4 + i));
				else
					vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
				vmovaps(ptr[&m_local.d[i].z], ymm0);
			}

			if (m_en.f)
			{
				// m_local.d[i].f = GSVector8i(df * shift[1 + i]).xxzzlh();
				if (i < 4)
					vmulps(ymm0, ymm1, Ymm(4 + i));
				else
					vmulps(ymm0, ymm1, ptr[g_const->m_shift_256b[i + 1]]);
				vcvttps2dq(ymm0, ymm0);
				vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
				vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
				vmovdqa(ptr[&m_local.d[i].f], ymm0);
			}
		}
	}
	else
	{
		// GSVector4 p = vertex[index[1]].p;
		mov(ecx, ptr[esp + _index]);
		mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
		shl(ecx, 6); // * sizeof(GSVertexSW)
		add(ecx, ptr[esp + _vertex]);

		if (m_en.f)
		{
			// m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();
			vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
			vcvttps2dq(xmm0, xmm0);
			vpextrd(ptr[&m_local.p.f], xmm0, 3);
		}

		if (m_en.z)
		{
			// m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
			mov(eax, ptr[ecx + offsetof(GSVertexSW, t.w)]);
			mov(ptr[&m_local.p.z], eax);
		}
	}
}
// Emits the texture-coordinate setup of the x86 AVX2 SetupPrim routine.
// Reads dscan.t through edx, stores dscan.t * ymm3 (shift[0], loaded by Generate_AVX2) to
// m_local.d8.stq and the per-entry deltas of s, t (and q when m_sel.fst is clear) to
// m_local.d[i]. With m_sel.fst set, the products are converted to integers (truncating).
void GSSetupPrimCodeGenerator::Texture_AVX2()
{
if (!m_en.t)
{
return;
}
// GSVector8 dt(dscan.t);
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, t)]);
// GSVector8 dt8 = dt * shift[0];
vmulps(ymm1, ymm0, ymm3);
if (m_sel.fst)
{
// m_local.d8.stq = GSVector8::cast(GSVector8i(dt8));
vcvttps2dq(ymm1, ymm1);
vmovdqa(ptr[&m_local.d8.stq], xmm1);
}
else
{
// m_local.d8.stq = dt8;
vmovaps(ptr[&m_local.d8.stq], xmm1);
}
// j selects the component: 0 = s, 1 = t, 2 = q (q is skipped when fst is set)
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{
// GSVector8 dstq = dt.xxxx/yyyy/zzzz();
vshufps(ymm1, ymm0, ymm0, (uint8)_MM_SHUFFLE(j, j, j, j));
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
{
// GSVector8 v = dstq * shift[1 + i];
// shift[1..4] are held in ymm4..ymm7; later entries are read from memory
if (i < 4)
vmulps(ymm2, ymm1, Ymm(4 + i));
else
vmulps(ymm2, ymm1, ptr[g_const->m_shift_256b[i + 1]]);
if (m_sel.fst)
{
// m_local.d[i].s/t = GSVector8::cast(GSVector8i(v));
vcvttps2dq(ymm2, ymm2);
switch (j)
{
case 0: vmovdqa(ptr[&m_local.d[i].s], ymm2); break;
case 1: vmovdqa(ptr[&m_local.d[i].t], ymm2); break;
}
}
else
{
// m_local.d[i].s/t/q = v;
switch (j)
{
case 0: vmovaps(ptr[&m_local.d[i].s], ymm2); break;
case 1: vmovaps(ptr[&m_local.d[i].t], ymm2); break;
case 2: vmovaps(ptr[&m_local.d[i].q], ymm2); break;
}
}
}
}
}
// Emits the colour setup of the x86 AVX2 SetupPrim routine.
// m_sel.iip set: reads dscan.c through edx and writes m_local.d8.c plus the per-entry
// red/blue and green/alpha deltas m_local.d[i].rb / m_local.d[i].ga.
// m_sel.iip clear: converts the colour of the primitive's last vertex and broadcasts it to
// m_local.c.rb / m_local.c.ga.
void GSSetupPrimCodeGenerator::Color_AVX2()
{
if (!m_en.c)
{
return;
}
if (m_sel.iip)
{
// GSVector8 dc(dscan.c);
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]);
// m_local.d8.c = GSVector8i(dc * shift[0]).xzyw().ps32();
vmulps(ymm1, ymm0, ymm3);
vcvttps2dq(ymm1, ymm1);
vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0));
vpackssdw(ymm1, ymm1);
vmovq(ptr[&m_local.d8.c], xmm1);
// ymm3 is not needed anymore (it is overwritten with db / da below)
// GSVector8 dr = dc.xxxx();
// GSVector8 db = dc.zzzz();
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
{
// GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32();
// shift[1..4] are held in ymm4..ymm7; later entries are read from memory
if (i < 4)
vmulps(ymm0, ymm2, Ymm(4 + i));
else
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
vcvttps2dq(ymm0, ymm0);
vpackssdw(ymm0, ymm0);
// GSVector8i b = GSVector8i(db * shift[1 + i]).ps32();
if (i < 4)
vmulps(ymm1, ymm3, Ymm(4 + i));
else
vmulps(ymm1, ymm3, ptr[g_const->m_shift_256b[i + 1]]);
vcvttps2dq(ymm1, ymm1);
vpackssdw(ymm1, ymm1);
// m_local.d[i].rb = r.upl16(b);
vpunpcklwd(ymm0, ymm1);
vmovdqa(ptr[&m_local.d[i].rb], ymm0);
}
// GSVector8 dc(dscan.c);
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
// GSVector8 dg = dc.yyyy();
// GSVector8 da = dc.wwww();
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
{
// GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32();
if (i < 4)
vmulps(ymm0, ymm2, Ymm(4 + i));
else
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
vcvttps2dq(ymm0, ymm0);
vpackssdw(ymm0, ymm0);
// GSVector8i a = GSVector8i(da * shift[1 + i]).ps32();
if (i < 4)
vmulps(ymm1, ymm3, Ymm(4 + i));
else
vmulps(ymm1, ymm3, ptr[g_const->m_shift_256b[i + 1]]);
vcvttps2dq(ymm1, ymm1);
vpackssdw(ymm1, ymm1);
// m_local.d[i].ga = g.upl16(a);
vpunpcklwd(ymm0, ymm1);
vmovdqa(ptr[&m_local.d[i].ga], ymm0);
}
}
else
{
// GSVector8i c = GSVector8i(GSVector8(vertex[index[last]].c));
// last = index slot of the vertex whose colour is used for flat shading
int last = 0;
switch (m_sel.prim)
{
case GS_POINT_CLASS: last = 0; break;
case GS_LINE_CLASS: last = 1; break;
case GS_TRIANGLE_CLASS: last = 2; break;
case GS_SPRITE_CLASS: last = 1; break;
}
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
{
mov(ecx, ptr[esp + _index]);
mov(ecx, ptr[ecx + sizeof(uint32) * last]);
shl(ecx, 6); // * sizeof(GSVertexSW)
add(ecx, ptr[esp + _vertex]);
}
vbroadcasti128(ymm0, ptr[ecx + offsetof(GSVertexSW, c)]);
vcvttps2dq(ymm0, ymm0);
// c = c.upl16(c.zwxy());
vpshufd(ymm1, ymm0, _MM_SHUFFLE(1, 0, 3, 2));
vpunpcklwd(ymm0, ymm1);
// if(!tme) c = c.srl16(7);
if (m_sel.tfx == TFX_NONE)
{
vpsrlw(ymm0, 7);
}
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
vpshufd(ymm1, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(ymm2, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[&m_local.c.rb], ymm1);
vmovdqa(ptr[&m_local.c.ga], ymm2);
}
}
#endif

View File

@@ -1,350 +0,0 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2021 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "GSSetupPrimCodeGenerator.h"
#include "GSVertexSW.h"
#include "GS/GS_codegen.h"
#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
// Byte offsets, relative to esp at entry of the generated routine, of its three pointer
// arguments. NOTE(review): slot 0 is presumably the return address - confirm.
static const int _args = 0;
static const int _vertex = _args + 1 * 4;
static const int _index = _args + 2 * 4;
static const int _dscan = _args + 3 * 4;
// Emits the complete x86 SSE SetupPrim routine: when an interpolated path is enabled it
// loads the dscan argument into edx and the shift table into xmm3.., then emits the depth,
// texture and colour sections followed by the return.
void GSSetupPrimCodeGenerator::Generate_SSE()
{
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{
// edx = dscan, read by the Depth/Texture/Color sections
mov(edx, dword[esp + _dscan]);
// xmm3 = shift[0], xmm4.. = shift[1..]
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{
movaps(Xmm(3 + i), ptr[g_const->m_shift_128b[i]]);
}
}
Depth_SSE();
Texture_SSE();
Color_SSE();
ret();
}
// Emits the depth (z) and fog (f) setup of the x86 SSE SetupPrim routine.
// Non-sprite primitives: reads dscan.p through edx and writes m_local.d4.f / m_local.d[i].f
// and m_local.d4.z / m_local.d[i].z.
// Sprites: loads the address of vertex[index[1]] into ecx and writes m_local.p.f /
// m_local.p.z.
void GSSetupPrimCodeGenerator::Depth_SSE()
{
if (!m_en.z && !m_en.f)
{
return;
}
if (m_sel.prim != GS_SPRITE_CLASS)
{
// GSVector4 p = dscan.p;
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, p)]);
if (m_en.f)
{
// GSVector4 df = p.wwww();
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
movaps(xmm2, xmm1);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[&m_local.d4.f], xmm2);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
movaps(xmm2, xmm1);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[&m_local.d[i].f], xmm2);
}
}
if (m_en.z)
{
// GSVector4 dz = p.zzzz();
// xmm0 (dscan.p) is overwritten here; the fog block above has already consumed it
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_local.d4.z = dz * 4.0f;
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
movdqa(ptr[&m_local.d4.z], xmm1);
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].z = dz * m_shift[i];
movaps(xmm1, xmm0);
mulps(xmm1, Xmm(4 + i));
movdqa(ptr[&m_local.d[i].z], xmm1);
}
}
}
else
{
// GSVector4 p = vertex[index[1]].p;
mov(ecx, ptr[esp + _index]);
mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
shl(ecx, 6); // * sizeof(GSVertexSW)
add(ecx, ptr[esp + _vertex]);
movaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
if (m_en.f)
{
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
cvttps2dq(xmm1, xmm0);
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[&m_local.p.f], xmm1);
}
if (m_en.z)
{
// uint32 z is bypassed in t.w
movdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]);
pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
movdqa(ptr[&m_local.p.z], xmm0);
}
}
}
// Emits the texture-coordinate setup code. Nothing is emitted unless m_en.t
// is set.
//
// Reads dscan.t through edx, stores the "* 4.0f" delta (scaled by xmm3) into
// m_local.d4.stq and the per-step deltas (scaled by xmm4..) into
// m_local.d[i].s / .t / .q. With m_sel.fst set the values are truncated to
// integers and only s and t are produced; otherwise s, t and q are stored as
// floats.
void GSSetupPrimCodeGenerator::Texture_SSE()
{
	if (!m_en.t)
	{
		return;
	}

	const int component_count = m_sel.fst ? 2 : 3;
	const int step_count = m_sel.notest ? 1 : 4;

	// GSVector4 t = dscan.t;

	movaps(xmm0, ptr[edx + offsetof(GSVertexSW, t)]);

	movaps(xmm1, xmm0);
	mulps(xmm1, xmm3);

	if (!m_sel.fst)
	{
		// m_local.d4.stq = t * 4.0f;

		movaps(ptr[&m_local.d4.stq], xmm1);
	}
	else
	{
		// m_local.d4.stq = GSVector4i(t * 4.0f);

		cvttps2dq(xmm1, xmm1);
		movdqa(ptr[&m_local.d4.stq], xmm1);
	}

	for (int comp = 0; comp < component_count; ++comp)
	{
		// Broadcast one component of t across xmm1:
		// comp 0 -> ds = t.xxxx(), comp 1 -> dt = t.yyyy(), comp 2 -> dq = t.zzzz()

		movaps(xmm1, xmm0);
		shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(comp, comp, comp, comp));

		for (int step = 0; step < step_count; ++step)
		{
			// GSVector4 v = ds/dt/dq * m_shift[step];

			movaps(xmm2, xmm1);
			mulps(xmm2, Xmm(4 + step));

			if (m_sel.fst)
			{
				// m_local.d[step].s/t = GSVector4i(v);

				cvttps2dq(xmm2, xmm2);

				if (comp == 0)
				{
					movdqa(ptr[&m_local.d[step].s], xmm2);
				}
				else if (comp == 1)
				{
					movdqa(ptr[&m_local.d[step].t], xmm2);
				}
			}
			else
			{
				// m_local.d[step].s/t/q = v;

				if (comp == 0)
				{
					movaps(ptr[&m_local.d[step].s], xmm2);
				}
				else if (comp == 1)
				{
					movaps(ptr[&m_local.d[step].t], xmm2);
				}
				else if (comp == 2)
				{
					movaps(ptr[&m_local.d[step].q], xmm2);
				}
			}
		}
	}
}
// Emits the colour setup code. Nothing is emitted unless m_en.c is set.
//
// With m_sel.iip set: reads dscan.c through edx (loaded by Generate_SSE) and
// fills m_local.d4.c plus the per-step m_local.d[i].rb / m_local.d[i].ga deltas.
// Otherwise: takes the colour of a single vertex (chosen by primitive class)
// and stores it, broadcast, in m_local.c.rb / m_local.c.ga.
void GSSetupPrimCodeGenerator::Color_SSE()
{
if (!m_en.c)
{
return;
}
if (m_sel.iip)
{
// GSVector4 c = dscan.c;
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]);
movaps(xmm1, xmm0);
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
movaps(xmm2, xmm0);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
packssdw(xmm2, xmm2);
movdqa(ptr[&m_local.d4.c], xmm2);
// xmm3 is not needed anymore
// (it is overwritten as a scratch register in the loops below)
// GSVector4 dr = c.xxxx();
// GSVector4 db = c.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
// First pass: red/blue deltas, one per step (a single step when notest is set).
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
movaps(xmm2, xmm0);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
packssdw(xmm2, xmm2);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
movaps(xmm3, xmm1);
mulps(xmm3, Xmm(4 + i));
cvttps2dq(xmm3, xmm3);
packssdw(xmm3, xmm3);
// m_local.d[i].rb = r.upl16(b);
punpcklwd(xmm2, xmm3);
movdqa(ptr[&m_local.d[i].rb], xmm2);
}
// GSVector4 c = dscan.c;
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
movaps(xmm1, xmm0);
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
// Second pass: green/alpha deltas, same structure as the red/blue pass.
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
movaps(xmm2, xmm0);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
packssdw(xmm2, xmm2);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
movaps(xmm3, xmm1);
mulps(xmm3, Xmm(4 + i));
cvttps2dq(xmm3, xmm3);
packssdw(xmm3, xmm3);
// m_local.d[i].ga = g.upl16(a);
punpcklwd(xmm2, xmm3);
movdqa(ptr[&m_local.d[i].ga], xmm2);
}
}
else
{
// GSVector4i c = GSVector4i(vertex[index[last]].c);
// `last` is the position in the index list of the vertex whose colour is used;
// it stays 0 for any primitive class not listed in the switch.
int last = 0;
switch (m_sel.prim)
{
case GS_POINT_CLASS: last = 0; break;
case GS_LINE_CLASS: last = 1; break;
case GS_TRIANGLE_CLASS: last = 2; break;
case GS_SPRITE_CLASS: last = 1; break;
}
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
{
// ecx = vertex + index[last] * 64
mov(ecx, ptr[esp + _index]);
mov(ecx, ptr[ecx + sizeof(uint32) * last]);
shl(ecx, 6); // * sizeof(GSVertexSW)
// NOTE(review): the shift by 6 assumes sizeof(GSVertexSW) == 64 - confirm against the struct
add(ecx, ptr[esp + _vertex]);
}
cvttps2dq(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]);
// c = c.upl16(c.zwxy());
// pshufd swaps the two 64-bit halves; interleaving the low words then pairs
// the low 16 bits of x with z (dword 0) and of y with w (dword 2).
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
punpcklwd(xmm0, xmm1);
// if(!tme) c = c.srl16(7);
if (m_sel.tfx == TFX_NONE)
{
psrlw(xmm0, 7);
}
// m_local.c.rb = c.xxxx();
// m_local.c.ga = c.zzzz();
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[&m_local.c.rb], xmm1);
movdqa(ptr[&m_local.c.ga], xmm2);
}
}
#endif

View File

@@ -466,12 +466,8 @@
<ClCompile Include="GS\GSDrawingContext.cpp" />
<ClCompile Include="GS\Renderers\SW\GSDrawScanline.cpp" />
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.cpp" />
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.avx.cpp" />
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.avx2.cpp" />
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.cpp" />
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.avx.cpp" />
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.avx2.cpp" />
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.cpp" />
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.cpp" />
<ClCompile Include="GS\Renderers\SW\GSNewCodeGenerator.cpp" />
<ClCompile Include="GS\GSDump.cpp" />
<ClCompile Include="GS\Renderers\Common\GSFunctionMap.cpp" />
<ClCompile Include="GS\Renderers\HW\GSHwHack.cpp" />
@@ -490,12 +486,7 @@
<ClCompile Include="GS\Window\GSSetting.cpp" />
<ClCompile Include="GS\Window\GSSettingsDlg.cpp" />
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.cpp" />
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.avx.cpp" />
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.avx2.cpp" />
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.cpp" />
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.avx.cpp" />
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.avx2.cpp" />
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.cpp" />
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.cpp" />
<ClCompile Include="GS\Renderers\OpenGL\GSShaderOGL.cpp" />
<ClCompile Include="GS\GSState.cpp" />
<ClCompile Include="GS\GSTables.cpp" />
@@ -815,7 +806,6 @@
<ClInclude Include="GS\Renderers\OpenGL\GLLoader.h" />
<ClInclude Include="GS\Renderers\OpenGL\GLState.h" />
<ClInclude Include="GS\GS.h" />
<ClInclude Include="GS\GS_codegen.h" />
<ClInclude Include="GS\GS_types.h" />
<ClInclude Include="GS\GSAlignedClass.h" />
<ClInclude Include="GS\GSBlock.h" />
@@ -834,6 +824,8 @@
<ClInclude Include="GS\GSDrawingEnvironment.h" />
<ClInclude Include="GS\Renderers\SW\GSDrawScanline.h" />
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.h" />
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.h" />
<ClInclude Include="GS\Renderers\SW\GSNewCodeGenerator.h" />
<ClInclude Include="GS\GSDump.h" />
<ClInclude Include="GS\Renderers\Common\GSFastList.h" />
<ClInclude Include="GS\Renderers\Common\GSFunctionMap.h" />
@@ -853,6 +845,7 @@
<ClInclude Include="GS\Window\GSSetting.h" />
<ClInclude Include="GS\Window\GSSettingsDlg.h" />
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.h" />
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.h" />
<ClInclude Include="GS\Renderers\OpenGL\GSShaderOGL.h" />
<ClInclude Include="GS\GSState.h" />
<ClInclude Include="GS\GSTables.h" />

View File

@@ -1517,22 +1517,10 @@
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.avx.cpp">
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.avx2.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.avx.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.avx2.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.cpp">
<ClCompile Include="GS\Renderers\SW\GSNewCodeGenerator.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSRendererSW.cpp">
@@ -1541,24 +1529,6 @@
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.avx.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.avx2.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.avx.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.avx2.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="GS\Renderers\SW\GSTextureCacheSW.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
@@ -2508,9 +2478,6 @@
<ClInclude Include="GS\GS.h">
<Filter>System\Ps2\GS</Filter>
</ClInclude>
<ClInclude Include="GS\GS_codegen.h">
<Filter>System\Ps2\GS</Filter>
</ClInclude>
<ClInclude Include="GS\GS_types.h">
<Filter>System\Ps2\GS</Filter>
</ClInclude>
@@ -2631,6 +2598,12 @@
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.h">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClInclude>
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.h">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClInclude>
<ClInclude Include="GS\Renderers\SW\GSNewCodeGenerator.h">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClInclude>
<ClInclude Include="GS\Renderers\SW\GSRendererSW.h">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClInclude>
@@ -2640,6 +2613,9 @@
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.h">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClInclude>
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.h">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClInclude>
<ClInclude Include="GS\Renderers\SW\GSTextureCacheSW.h">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClInclude>