mirror of
https://github.com/PCSX2/pcsx2.git
synced 2026-01-31 01:15:24 +01:00
Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fd145e65aa | ||
|
|
6596b7f27e | ||
|
|
9d767838d6 | ||
|
|
f55219bb1b | ||
|
|
805b647c73 | ||
|
|
fd0351ca8f | ||
|
|
ed5a7802f3 | ||
|
|
44f8317b7e | ||
|
|
0200933ddd |
@@ -31,6 +31,10 @@
|
||||
|
||||
#include "common/emitter/x86_intrin.h"
|
||||
|
||||
// The C++ standard doesn't allow `offsetof` to be used on non-constant values (e.g. `offsetof(class, field[i])`)
|
||||
// Use this in those situations
|
||||
#define OFFSETOF(a, b) (reinterpret_cast<size_t>(&(static_cast<a*>(0)->b)))
|
||||
|
||||
// Renamed ARRAYSIZE to ArraySize -- looks nice and gets rid of Windows.h conflicts (air)
|
||||
// Notes: I'd have used ARRAY_SIZE instead but ran into cross-platform lib conflicts with
|
||||
// that as well. >_<
|
||||
|
||||
@@ -639,21 +639,12 @@ set(pcsx2GSSources
|
||||
GS/Renderers/HW/GSTextureCache.cpp
|
||||
GS/Renderers/SW/GSDrawScanline.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x64.avx2.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.x86.avx2.cpp
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
|
||||
GS/Renderers/SW/GSNewCodeGenerator.cpp
|
||||
GS/Renderers/SW/GSRasterizer.cpp
|
||||
GS/Renderers/SW/GSRendererSW.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.x64.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.x64.avx.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.x64.avx2.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.x86.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.x86.avx.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.x86.avx2.cpp
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
|
||||
GS/Renderers/SW/GSTextureCacheSW.cpp
|
||||
GS/Renderers/SW/GSTextureSW.cpp
|
||||
GS/Renderers/OpenGL/GLLoader.cpp
|
||||
@@ -679,7 +670,6 @@ set(pcsx2GSHeaders
|
||||
GS/GSDrawingEnvironment.h
|
||||
GS/GSDump.h
|
||||
GS/GS_types.h
|
||||
GS/GS_codegen.h
|
||||
GS/GS.h
|
||||
GS/GSLocalMemory.h
|
||||
GS/GSLzma.h
|
||||
@@ -712,11 +702,14 @@ set(pcsx2GSHeaders
|
||||
GS/Renderers/HW/GSTextureCache.h
|
||||
GS/Renderers/HW/GSVertexHW.h
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.h
|
||||
GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h
|
||||
GS/Renderers/SW/GSDrawScanline.h
|
||||
GS/Renderers/SW/GSNewCodeGenerator.h
|
||||
GS/Renderers/SW/GSRasterizer.h
|
||||
GS/Renderers/SW/GSRendererSW.h
|
||||
GS/Renderers/SW/GSScanlineEnvironment.h
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.h
|
||||
GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h
|
||||
GS/Renderers/SW/GSTextureCacheSW.h
|
||||
GS/Renderers/SW/GSTextureSW.h
|
||||
GS/Renderers/SW/GSVertexSW.h
|
||||
|
||||
@@ -18,10 +18,11 @@
|
||||
template <int i>
|
||||
class GSAlignedClass
|
||||
{
|
||||
public:
|
||||
GSAlignedClass() {}
|
||||
virtual ~GSAlignedClass() {}
|
||||
protected:
|
||||
GSAlignedClass() = default;
|
||||
~GSAlignedClass() = default;
|
||||
|
||||
public:
|
||||
void* operator new(size_t size)
|
||||
{
|
||||
return _aligned_malloc(size, i);
|
||||
|
||||
@@ -110,11 +110,7 @@ extern void vmfree(void* ptr, size_t size);
|
||||
|
||||
// Convert gcc SSE defines into the GS (windows) _M_SSE define
|
||||
#if defined(__AVX2__)
|
||||
#if defined(__x86_64__)
|
||||
#define _M_SSE 0x500 // TODO
|
||||
#else
|
||||
#define _M_SSE 0x501
|
||||
#endif
|
||||
#define _M_SSE 0x501
|
||||
#elif defined(__AVX__)
|
||||
#define _M_SSE 0x500
|
||||
#elif defined(__SSE4_1__)
|
||||
|
||||
3508
pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
Normal file
3508
pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
Normal file
File diff suppressed because it is too large
Load Diff
189
pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h
Normal file
189
pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h
Normal file
@@ -0,0 +1,189 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GSScanlineEnvironment.h"
|
||||
#include "GSNewCodeGenerator.h"
|
||||
|
||||
#undef _t // Conflict with wx, hopefully no one needs this
|
||||
|
||||
// Compile-time selection of the vector register type used by the draw-scanline generator.
// _M_SSE >= 0x501 selects Xbyak::Ymm; anything lower selects Xbyak::Xmm.
// DRAW_SCANLINE_USING_XMM / DRAW_SCANLINE_USING_YMM are the matching 0/1 flags for use in #if.
#if _M_SSE >= 0x501
|
||||
#define DRAW_SCANLINE_VECTOR_REGISTER Xbyak::Ymm
|
||||
#define DRAW_SCANLINE_USING_XMM 0
|
||||
#define DRAW_SCANLINE_USING_YMM 1
|
||||
#else
|
||||
#define DRAW_SCANLINE_VECTOR_REGISTER Xbyak::Xmm
|
||||
#define DRAW_SCANLINE_USING_XMM 1
|
||||
#define DRAW_SCANLINE_USING_YMM 0
|
||||
#endif
|
||||
|
||||
/// Code generator for the draw-scanline routine, layered on GSNewCodeGenerator.
/// The vector register type (XYm) is fixed at compile time by DRAW_SCANLINE_VECTOR_REGISTER.
/// Everything except chooseLocal is only declared here; Generate() is the public entry point.
class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator
|
||||
{
|
||||
using _parent = GSNewCodeGenerator;
|
||||
using XYm = DRAW_SCANLINE_VECTOR_REGISTER;
|
||||
|
||||
/// On x86-64 we reserve a bunch of GPRs for holding addresses of locals that would otherwise be hard to reach
|
||||
/// On x86-32 the same values are just raw 32-bit addresses
|
||||
using LocalAddr = Choose3264<size_t, AddressReg>::type;
|
||||
|
||||
// Which register flavour XYm resolved to, plus sizes derived from it:
// wordsize is the pointer width in bytes, vecsize the vector width in bytes,
// vecsizelog its log2, and vecints the number of 32-bit lanes per vector.
constexpr static bool isXmm = std::is_same<XYm, Xbyak::Xmm>::value;
|
||||
constexpr static bool isYmm = std::is_same<XYm, Xbyak::Ymm>::value;
|
||||
constexpr static int wordsize = is64 ? 8 : 4;
|
||||
constexpr static int vecsize = isXmm ? 16 : 32;
|
||||
constexpr static int vecsizelog = isXmm ? 4 : 5;
|
||||
constexpr static int vecints = vecsize / 4;
|
||||
|
||||
|
||||
// MARK: - Constants
|
||||
|
||||
constexpr static int _32_args = 16;
|
||||
// Placeholder for an offset that has no meaning on the current architecture (see _v below)
constexpr static int _invalid = 0xaaaaaaaa;
|
||||
#ifdef _WIN32
|
||||
constexpr static int _64_top = 8 * 0;
|
||||
// XMM registers will be saved to `rsp + _64_win_xmm_start + id - 6`
|
||||
// Which will put xmm6 after the temporaries, then xmm7, etc
|
||||
constexpr static int _64_win_xmm_start = 8 * 2;
|
||||
// Windows has no redzone and also has 10 xmm registers to save
|
||||
constexpr static int _64_win_stack_size = _64_win_xmm_start + 16 * 10;
|
||||
#else
|
||||
// System-V has a redzone so stick everything there
|
||||
constexpr static int _64_rz_rbx = -8 * 1;
|
||||
constexpr static int _64_rz_r12 = -8 * 2;
|
||||
constexpr static int _64_rz_r13 = -8 * 3;
|
||||
constexpr static int _64_rz_r14 = -8 * 4;
|
||||
constexpr static int _64_rz_r15 = -8 * 5;
|
||||
constexpr static int _64_top = -8 * 6;
|
||||
#endif
|
||||
// On 64-bit, _top comes from the platform-specific block above and _v has no slot (_invalid).
// On 32-bit, both offsets are expressed relative to _32_args.
constexpr static int _top = is64 ? _64_top : _32_args + 4;
|
||||
constexpr static int _v = is64 ? _invalid : _32_args + 8;
|
||||
|
||||
GSScanlineSelector m_sel;
|
||||
GSScanlineLocalData& m_local;
|
||||
bool m_rip;
|
||||
bool use_lod;
|
||||
|
||||
// Vector registers 0-15 typed as XYm, so the same code can name a register regardless of width
const XYm xym0{0}, xym1{1}, xym2{2}, xym3{3}, xym4{4}, xym5{5}, xym6{6}, xym7{7}, xym8{8}, xym9{9}, xym10{10}, xym11{11}, xym12{12}, xym13{13}, xym14{14}, xym15{15};
|
||||
/// Note: a2 and t3 are only available on x86-64
|
||||
/// Outside of Init, usable registers are a0, t0, t1, t2, t3[x64], rax, rbx, rdx, r10+
|
||||
const AddressReg a0, a1, a2, a3, t0, t1, t2, t3;
|
||||
const LocalAddr _g_const, _m_local, _m_local__gd, _m_local__gd__vm;
|
||||
/// Available on both x86 and x64, not always valid
|
||||
const XYm _rb, _ga, _fm, _zm, _fd, _test;
|
||||
/// Always valid if needed, x64 only
|
||||
const XYm _z, _f, _s, _t, _q, _f_rb, _f_ga;
|
||||
|
||||
/// Returns the first arg on 32-bit, second on 64-bit
|
||||
static LocalAddr chooseLocal(const void* addr32, AddressReg reg64)
|
||||
{
|
||||
return choose3264((size_t)addr32, reg64);
|
||||
}
|
||||
|
||||
public:
|
||||
GSDrawScanlineCodeGenerator2(Xbyak::CodeGenerator* base, CPUInfo cpu, void* param, uint64 key);
|
||||
void Generate();
|
||||
|
||||
private:
|
||||
/// Loads the given address into the given register if needed, and returns something that can be used in a `ptr[]`
|
||||
LocalAddr loadAddress(AddressReg reg, const void* addr);
|
||||
/// Broadcast 128 bits of floats from memory to the whole register, whatever size that register might be
|
||||
void broadcastf128(const XYm& reg, const Xbyak::Address& mem);
|
||||
/// Broadcast 128 bits of integers from memory to the whole register, whatever size that register might be
|
||||
void broadcasti128(const XYm& reg, const Xbyak::Address& mem);
|
||||
/// Broadcast a floating-point variable stored in GSScanlineLocalData to the whole register
|
||||
/// On YMM registers this will be a broadcast from a 32-bit value
|
||||
/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
|
||||
void broadcastssLocal(const XYm& reg, const Xbyak::Address& mem);
|
||||
/// Broadcast a qword variable stored in GSScanlineLocalData to the whole register
|
||||
/// On YMM registers this will be a broadcast from a 64-bit value
|
||||
/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
|
||||
void pbroadcastqLocal(const XYm& reg, const Xbyak::Address& mem);
|
||||
/// Broadcast a dword variable stored in GSScanlineLocalData to the whole register
|
||||
/// On YMM registers this will be a broadcast from a 32-bit value
|
||||
/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
|
||||
void pbroadcastdLocal(const XYm& reg, const Xbyak::Address& mem);
|
||||
/// Broadcast a word variable stored in GSScanlineLocalData to the whole register
|
||||
/// On YMM registers this will be a broadcast from a 16-bit value
|
||||
/// On XMM registers this will be a load of a full 128-bit value, with the broadcast happening before storing to the local data
|
||||
void pbroadcastwLocal(const XYm& reg, const Xbyak::Address& mem);
|
||||
/// Broadcast a 32-bit GPR to a vector register
|
||||
void broadcastGPRToVec(const XYm& vec, const Xbyak::Reg32& gpr);
|
||||
void modulate16(const XYm& a, const Xbyak::Operand& f, uint8 shift);
|
||||
void lerp16(const XYm& a, const XYm& b, const XYm& f, uint8 shift);
|
||||
void lerp16_4(const XYm& a, const XYm& b, const XYm& f);
|
||||
void mix16(const XYm& a, const XYm& b, const XYm& temp);
|
||||
void clamp16(const XYm& a, const XYm& temp);
|
||||
void alltrue(const XYm& test);
|
||||
void blend(const XYm& a, const XYm& b, const XYm& mask);
|
||||
void blendr(const XYm& b, const XYm& a, const XYm& mask);
|
||||
void blend8(const XYm& a, const XYm& b);
|
||||
void blend8r(const XYm& b, const XYm& a);
|
||||
void split16_2x8(const XYm& l, const XYm& h, const XYm& src);
|
||||
|
||||
void Init();
|
||||
void Step();
|
||||
void TestZ(const XYm& temp1, const XYm& temp2);
|
||||
void SampleTexture();
|
||||
void SampleTexture_TexelReadHelper(int mip_offset);
|
||||
void Wrap(const XYm& uv);
|
||||
void Wrap(const XYm& uv0, const XYm& uv1);
|
||||
void SampleTextureLOD();
|
||||
void WrapLOD(const XYm& uv);
|
||||
void WrapLOD(const XYm& uv0, const XYm& uv1);
|
||||
void AlphaTFX();
|
||||
void ReadMask();
|
||||
void TestAlpha();
|
||||
void ColorTFX();
|
||||
void Fog();
|
||||
void ReadFrame();
|
||||
void TestDestAlpha();
|
||||
void WriteMask();
|
||||
void WriteZBuf();
|
||||
void AlphaBlend();
|
||||
void WriteFrame();
|
||||
void ReadPixel(const XYm& dst, const XYm& tmp, const AddressReg& addr);
|
||||
// The mask register type depends on the vector width: Reg8 for XMM builds, Reg32 for YMM builds
#if DRAW_SCANLINE_USING_XMM
|
||||
void WritePixel(const XYm& src_, const AddressReg& addr, const Xbyak::Reg8& mask, bool fast, int psm, int fz);
|
||||
#else
|
||||
void WritePixel(const XYm& src_, const AddressReg& addr, const Xbyak::Reg32& mask, bool fast, int psm, int fz);
|
||||
#endif
|
||||
void WritePixel(const Xmm& src, const AddressReg& addr, uint8 i, uint8 j, int psm);
|
||||
void ReadTexel1(const XYm& dst, const XYm& src, const XYm& tmp1, const XYm& tmp2, int mip_offset);
|
||||
void ReadTexel4(
|
||||
const XYm& d0, const XYm& d1,
|
||||
const XYm& d2s0, const XYm& d3s1,
|
||||
const XYm& s2, const XYm& s3,
|
||||
const XYm& tmp1, const XYm& tmp2,
|
||||
int mip_offset);
|
||||
void ReadTexelImpl(
|
||||
const XYm& d0, const XYm& d1,
|
||||
const XYm& d2s0, const XYm& d3s1,
|
||||
const XYm& s2, const XYm& s3,
|
||||
const XYm& tmp1, const XYm& tmp2,
|
||||
int pixels, int mip_offset);
|
||||
void ReadTexelImplLoadTexLOD(int lod, int mip_offset);
|
||||
void ReadTexelImplYmm(
|
||||
const Ymm& d0, const Ymm& d1,
|
||||
const Ymm& d2s0, const Ymm& d3s1,
|
||||
const Ymm& s2, const Ymm& s3,
|
||||
const Ymm& tmp,
|
||||
int pixels, int mip_offset);
|
||||
void ReadTexelImplSSE4(
|
||||
const Xmm& d0, const Xmm& d1,
|
||||
const Xmm& d2s0, const Xmm& d3s1,
|
||||
const Xmm& s2, const Xmm& s3,
|
||||
int pixels, int mip_offset);
|
||||
void ReadTexelImpl(const Xmm& dst, const Xmm& addr, uint8 i, bool texInA3, bool preserveDst);
|
||||
};
|
||||
@@ -15,17 +15,8 @@
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSDrawScanlineCodeGenerator.h"
|
||||
#include "GSDrawScanlineCodeGenerator.all.h"
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
#else
|
||||
void GSDrawScanlineCodeGenerator::Generate()
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
Generate_AVX();
|
||||
else
|
||||
Generate_SSE();
|
||||
}
|
||||
#endif
|
||||
|
||||
GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
|
||||
: GSCodeGenerator(code, maxsize)
|
||||
@@ -37,227 +28,5 @@ GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key
|
||||
if (m_sel.breakpoint)
|
||||
db(0xCC);
|
||||
|
||||
try
|
||||
{
|
||||
Generate();
|
||||
}
|
||||
catch (std::exception& e)
|
||||
{
|
||||
fprintf(stderr, "ERR:GSDrawScanlineCodeGenerator %s\n", e.what());
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::modulate16(const Xmm& a, const Operand& f, uint8 shift)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
if (shift == 0)
|
||||
{
|
||||
vpmulhrsw(a, f);
|
||||
}
|
||||
else
|
||||
{
|
||||
vpsllw(a, shift + 1);
|
||||
vpmulhw(a, f);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (shift == 0 && m_cpu.has(Xbyak::util::Cpu::tSSSE3))
|
||||
{
|
||||
pmulhrsw(a, f);
|
||||
}
|
||||
else
|
||||
{
|
||||
psllw(a, shift + 1);
|
||||
pmulhw(a, f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& f, uint8 shift)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpsubw(a, b);
|
||||
modulate16(a, f, shift);
|
||||
vpaddw(a, b);
|
||||
}
|
||||
else
|
||||
{
|
||||
psubw(a, b);
|
||||
modulate16(a, f, shift);
|
||||
paddw(a, b);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpsubw(a, b);
|
||||
vpmullw(a, f);
|
||||
vpsraw(a, 4);
|
||||
vpaddw(a, b);
|
||||
}
|
||||
else
|
||||
{
|
||||
psubw(a, b);
|
||||
pmullw(a, f);
|
||||
psraw(a, 4);
|
||||
paddw(a, b);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpblendw(a, b, 0xaa);
|
||||
}
|
||||
else
|
||||
{
|
||||
pblendw(a, b, 0xaa);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::clamp16(const Xmm& a, const Xmm& temp)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpackuswb(a, a);
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
// Greg: why ?
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX2))
|
||||
{
|
||||
ASSERT(a.isYMM());
|
||||
vpermq(Ymm(a.getIdx()), Ymm(a.getIdx()), _MM_SHUFFLE(3, 1, 2, 0)); // this sucks
|
||||
}
|
||||
#endif
|
||||
|
||||
vpmovzxbw(a, a);
|
||||
}
|
||||
else
|
||||
{
|
||||
packuswb(a, a);
|
||||
pmovzxbw(a, a);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::alltrue(const Xmm& test)
|
||||
{
|
||||
uint32 mask = test.isYMM() ? 0xffffffff : 0xffff;
|
||||
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpmovmskb(eax, test);
|
||||
cmp(eax, mask);
|
||||
je("step", T_NEAR);
|
||||
}
|
||||
else
|
||||
{
|
||||
pmovmskb(eax, test);
|
||||
cmp(eax, mask);
|
||||
je("step", T_NEAR);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::blend(const Xmm& a, const Xmm& b, const Xmm& mask)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpand(b, mask);
|
||||
vpandn(mask, a);
|
||||
vpor(a, b, mask);
|
||||
}
|
||||
else
|
||||
{
|
||||
pand(b, mask);
|
||||
pandn(mask, a);
|
||||
por(b, mask);
|
||||
movdqa(a, b);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::blendr(const Xmm& b, const Xmm& a, const Xmm& mask)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpand(b, mask);
|
||||
vpandn(mask, a);
|
||||
vpor(b, mask);
|
||||
}
|
||||
else
|
||||
{
|
||||
pand(b, mask);
|
||||
pandn(mask, a);
|
||||
por(b, mask);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::blend8(const Xmm& a, const Xmm& b)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
vpblendvb(a, a, b, xmm0);
|
||||
else
|
||||
pblendvb(a, b);
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::blend8r(const Xmm& b, const Xmm& a)
|
||||
{
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
vpblendvb(b, a, b, xmm0);
|
||||
}
|
||||
else
|
||||
{
|
||||
pblendvb(a, b);
|
||||
movdqa(b, a);
|
||||
}
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src)
|
||||
{
|
||||
// l = src & 0xFF; (1 left shift + 1 right shift)
|
||||
// h = (src >> 8) & 0xFF; (1 right shift)
|
||||
|
||||
if (m_cpu.has(Xbyak::util::Cpu::tAVX))
|
||||
{
|
||||
if (src == h)
|
||||
{
|
||||
vpsllw(l, src, 8);
|
||||
vpsrlw(h, 8);
|
||||
}
|
||||
else if (src == l)
|
||||
{
|
||||
vpsrlw(h, src, 8);
|
||||
vpsllw(l, 8);
|
||||
}
|
||||
else
|
||||
{
|
||||
vpsllw(l, src, 8);
|
||||
vpsrlw(h, src, 8);
|
||||
}
|
||||
vpsrlw(l, 8);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (src == h)
|
||||
{
|
||||
movdqa(l, src);
|
||||
}
|
||||
else if (src == l)
|
||||
{
|
||||
movdqa(h, src);
|
||||
}
|
||||
else
|
||||
{
|
||||
movdqa(l, src);
|
||||
movdqa(h, src);
|
||||
}
|
||||
psllw(l, 8);
|
||||
psrlw(l, 8);
|
||||
psrlw(h, 8);
|
||||
}
|
||||
GSDrawScanlineCodeGenerator2(this, CPUInfo(m_cpu), (void*)&m_local, m_sel.key).Generate();
|
||||
}
|
||||
|
||||
@@ -27,117 +27,12 @@
|
||||
|
||||
class GSDrawScanlineCodeGenerator : public GSCodeGenerator
|
||||
{
|
||||
typedef Xbyak::Ymm Ymm;
|
||||
typedef Xbyak::Xmm Xmm;
|
||||
typedef Xbyak::Reg8 Reg8;
|
||||
typedef Xbyak::Operand Operand;
|
||||
|
||||
void operator=(const GSDrawScanlineCodeGenerator&);
|
||||
|
||||
GSScanlineSelector m_sel;
|
||||
GSScanlineLocalData& m_local;
|
||||
bool m_rip;
|
||||
|
||||
void Generate();
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
|
||||
void Init();
|
||||
void Step();
|
||||
void TestZ(const Ymm& temp1, const Ymm& temp2);
|
||||
void SampleTexture();
|
||||
void Wrap(const Ymm& uv0);
|
||||
void Wrap(const Ymm& uv0, const Ymm& uv1);
|
||||
void SampleTextureLOD();
|
||||
void WrapLOD(const Ymm& uv0);
|
||||
void WrapLOD(const Ymm& uv0, const Ymm& uv1);
|
||||
void AlphaTFX();
|
||||
void ReadMask();
|
||||
void TestAlpha();
|
||||
void ColorTFX();
|
||||
void Fog();
|
||||
void ReadFrame();
|
||||
void TestDestAlpha();
|
||||
void WriteMask();
|
||||
void WriteZBuf();
|
||||
void AlphaBlend();
|
||||
void WriteFrame();
|
||||
void ReadPixel(const Ymm& dst, const Ymm& temp, const RegLong& addr);
|
||||
void WritePixel(const Ymm& src, const Ymm& temp, const RegLong& addr, const Xbyak::Reg32& mask, bool fast, int psm, int fz);
|
||||
void WritePixel(const Xmm& src, const RegLong& addr, uint8 i, uint8 j, int psm);
|
||||
void ReadTexel(int pixels, int mip_offset = 0);
|
||||
void ReadTexel(const Ymm& dst, const Ymm& addr, uint8 i);
|
||||
|
||||
#else
|
||||
|
||||
void Generate_SSE();
|
||||
void Init_SSE();
|
||||
void Step_SSE();
|
||||
void TestZ_SSE(const Xmm& temp1, const Xmm& temp2);
|
||||
void SampleTexture_SSE();
|
||||
void Wrap_SSE(const Xmm& uv0);
|
||||
void Wrap_SSE(const Xmm& uv0, const Xmm& uv1);
|
||||
void SampleTextureLOD_SSE();
|
||||
void WrapLOD_SSE(const Xmm& uv0);
|
||||
void WrapLOD_SSE(const Xmm& uv0, const Xmm& uv1);
|
||||
void AlphaTFX_SSE();
|
||||
void ReadMask_SSE();
|
||||
void TestAlpha_SSE();
|
||||
void ColorTFX_SSE();
|
||||
void Fog_SSE();
|
||||
void ReadFrame_SSE();
|
||||
void TestDestAlpha_SSE();
|
||||
void WriteMask_SSE();
|
||||
void WriteZBuf_SSE();
|
||||
void AlphaBlend_SSE();
|
||||
void WriteFrame_SSE();
|
||||
void ReadPixel_SSE(const Xmm& dst, const RegLong& addr);
|
||||
void WritePixel_SSE(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz);
|
||||
void WritePixel_SSE(const Xmm& src, const RegLong& addr, uint8 i, int psm);
|
||||
void ReadTexel_SSE(int pixels, int mip_offset = 0);
|
||||
void ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i);
|
||||
|
||||
void Generate_AVX();
|
||||
void Init_AVX();
|
||||
void Step_AVX();
|
||||
void TestZ_AVX(const Xmm& temp1, const Xmm& temp2);
|
||||
void SampleTexture_AVX();
|
||||
void Wrap_AVX(const Xmm& uv0);
|
||||
void Wrap_AVX(const Xmm& uv0, const Xmm& uv1);
|
||||
void SampleTextureLOD_AVX();
|
||||
void WrapLOD_AVX(const Xmm& uv0);
|
||||
void WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1);
|
||||
void AlphaTFX_AVX();
|
||||
void ReadMask_AVX();
|
||||
void TestAlpha_AVX();
|
||||
void ColorTFX_AVX();
|
||||
void Fog_AVX();
|
||||
void ReadFrame_AVX();
|
||||
void TestDestAlpha_AVX();
|
||||
void WriteMask_AVX();
|
||||
void WriteZBuf_AVX();
|
||||
void AlphaBlend_AVX();
|
||||
void WriteFrame_AVX();
|
||||
void ReadPixel_AVX(const Xmm& dst, const RegLong& addr);
|
||||
void WritePixel_AVX(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz);
|
||||
void WritePixel_AVX(const Xmm& src, const RegLong& addr, uint8 i, int psm);
|
||||
void ReadTexel_AVX(int pixels, int mip_offset = 0);
|
||||
void ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i);
|
||||
|
||||
#endif
|
||||
|
||||
void modulate16(const Xmm& a, const Operand& f, uint8 shift);
|
||||
void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, uint8 shift);
|
||||
void lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f);
|
||||
void mix16(const Xmm& a, const Xmm& b, const Xmm& temp);
|
||||
void clamp16(const Xmm& a, const Xmm& temp);
|
||||
void alltrue(const Xmm& test);
|
||||
void blend(const Xmm& a, const Xmm& b, const Xmm& mask);
|
||||
void blendr(const Xmm& b, const Xmm& a, const Xmm& mask);
|
||||
void blend8(const Xmm& a, const Xmm& b);
|
||||
void blend8r(const Xmm& b, const Xmm& a);
|
||||
void split16_2x8(const Xmm& l, const Xmm& h, const Xmm& src);
|
||||
|
||||
public:
|
||||
GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);
|
||||
};
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,118 +0,0 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSDrawScanlineCodeGenerator.h"
|
||||
|
||||
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
// It is useless to port the code to SSEx, better use the faster 32 bits version instead
|
||||
void GSDrawScanlineCodeGenerator::Generate_SSE()
|
||||
{
|
||||
// Avoid a crash if someone want to use it
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Init_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Step_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestZ_SSE(const Xmm& temp1, const Xmm& temp2)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::SampleTexture_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Wrap_SSE(const Xmm& uv0, const Xmm& uv1)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::AlphaTFX_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadMask_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestAlpha_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ColorTFX_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::Fog_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadFrame_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::TestDestAlpha_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteMask_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteZBuf_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::AlphaBlend_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WriteFrame_SSE()
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadPixel_SSE(const Xmm& dst, const RegLong& addr)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const RegLong& addr, const Reg8& mask, bool fast, int psm, int fz)
|
||||
{
|
||||
}
|
||||
|
||||
//static const int s_offsets[4] = {0, 2, 8, 10};
|
||||
|
||||
void GSDrawScanlineCodeGenerator::WritePixel_SSE(const Xmm& src, const RegLong& addr, uint8 i, int psm)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel_SSE(int pixels, int mip_offset)
|
||||
{
|
||||
}
|
||||
|
||||
void GSDrawScanlineCodeGenerator::ReadTexel_SSE(const Xmm& dst, const Xmm& addr, uint8 i)
|
||||
{
|
||||
}
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -13,26 +13,5 @@
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
#ifdef _M_AMD64
|
||||
// Yeah let use mips naming ;)
|
||||
#ifdef _WIN64
|
||||
#define a0 rcx
|
||||
#define a1 rdx
|
||||
#define a2 r8
|
||||
#define a3 r9
|
||||
#define t0 rdi
|
||||
#define t1 rsi
|
||||
#else
|
||||
#define a0 rdi
|
||||
#define a1 rsi
|
||||
#define a2 rdx
|
||||
#define a3 rcx
|
||||
#define t0 r8
|
||||
#define t1 r9
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSNewCodeGenerator.h"
|
||||
489
pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h
Normal file
489
pcsx2/GS/Renderers/SW/GSNewCodeGenerator.h
Normal file
@@ -0,0 +1,489 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GS/GS_types.h"
|
||||
#include "xbyak/xbyak.h"
|
||||
#include "xbyak/xbyak_util.h"
|
||||
|
||||
/// Instruction-set levels the code generators distinguish between.
/// The enum is wrapped in a namespace of the same name, so values are spelled SSEVersion::AVX2 etc.
/// and the type itself is SSEVersion::SSEVersion.
namespace SSEVersion
|
||||
{
|
||||
enum SSEVersion
|
||||
{
|
||||
// Numeric values increase with capability: SSE41 < AVX < AVX2
AVX2 = 0x501,
|
||||
AVX = 0x500,
|
||||
SSE41 = 0x401,
|
||||
};
|
||||
}
|
||||
|
||||
/// Similar to Xbyak::util::cpu but more open to us putting in extra flags (e.g. "vpgatherdd is fast"), as well as making it easier to test other configurations by artificially limiting features
|
||||
struct CPUInfo
|
||||
{
|
||||
// Defaults describe the most conservative configuration: no FMA, SSE4.1 only
bool hasFMA = false;
|
||||
SSEVersion::SSEVersion sseVersion = SSEVersion::SSE41;
|
||||
|
||||
CPUInfo() = default;
|
||||
/// Builds the feature set from an Xbyak CPU query object
CPUInfo(const Xbyak::util::Cpu& cpu)
|
||||
{
|
||||
// Start at the SSE4.1 baseline and raise it for each feature the CPU reports;
// AVX2 is checked last so it takes precedence over AVX
auto version = SSEVersion::SSE41;
|
||||
if (cpu.has(cpu.tAVX))
|
||||
version = SSEVersion::AVX;
|
||||
if (cpu.has(cpu.tAVX2))
|
||||
version = SSEVersion::AVX2;
|
||||
|
||||
hasFMA = cpu.has(cpu.tFMA);
|
||||
sseVersion = version;
|
||||
}
|
||||
};
|
||||
|
||||
/// Code generator that automatically selects between SSE and AVX, x86 and x64 so you don't have to
|
||||
/// Should make combined SSE and AVX codegen much easier
|
||||
class GSNewCodeGenerator
|
||||
{
|
||||
public:
|
||||
using Address = Xbyak::Address;
|
||||
using Label = Xbyak::Label;
|
||||
using Operand = Xbyak::Operand;
|
||||
using Reg32e = Xbyak::Reg32e;
|
||||
using Reg32 = Xbyak::Reg32;
|
||||
using Reg16 = Xbyak::Reg16;
|
||||
using Reg8 = Xbyak::Reg8;
|
||||
using Reg = Xbyak::Reg;
|
||||
using Xmm = Xbyak::Xmm;
|
||||
using Ymm = Xbyak::Ymm;
|
||||
using Zmm = Xbyak::Zmm;
|
||||
|
||||
/// Exception type used by the generator's validation helpers to report an
/// instruction or register that does not fit the current target
/// (64-bit-only in 32-bit code, or an SSE/AVX mismatch).
class Error : public std::exception
{
public:
	/// Reason for the failure; also the index into the message table in what()
	enum Value
	{
		ERR_64_BIT_REG_IN_32,
		ERR_64_INSTR_IN_32,
		ERR_SSE_INSTR_IN_AVX,
		ERR_AVX_INSTR_IN_SSE,
	};

	Value value;

	Error(Value value) : value(value) {}

	/// Returns a static, human-readable description of `value`.
	/// Marked `override` so the compiler checks that this really replaces
	/// std::exception::what() instead of silently declaring a new function.
	const char* what() const noexcept override
	{
		// Order must match the Value enum above
		static const char* const tbl[] = {
			"used 64-bit register in 32-bit code",
			"used 64-bit only instruction in 32-bit code",
			"used SSE instruction in AVX code",
			"used AVX instruction in SSE code",
		};
		// Converting to an unsigned type turns a negative value into a large index,
		// so a single bounds check rejects everything outside the table
		const size_t idx = static_cast<size_t>(value);
		if (idx < (sizeof(tbl) / sizeof(*tbl)))
			return tbl[idx];
		return "GSNewCodeGenerator Unknown Error";
	}
};
|
||||
|
||||
private:
|
||||
/// Make sure the register is okay to use
/// Does nothing on 64-bit builds. On 32-bit builds, throws Error(ERR_64_BIT_REG_IN_32) when
/// `op` is a register that Xbyak flags via isExtIdx() or isExt8bit(); for a memory operand the
/// same check is applied to its index and base registers.
void validateRegister(const Operand& op)
|
||||
{
|
||||
if (is64)
|
||||
return;
|
||||
if (op.isREG() && (op.isExtIdx() || op.isExt8bit()))
|
||||
throw Error(Error::ERR_64_BIT_REG_IN_32);
|
||||
if (op.isMEM())
|
||||
{
|
||||
// Recurse into the registers that make up the address expression
auto e = static_cast<const Address&>(op).getRegExp();
|
||||
validateRegister(e.getIndex());
|
||||
validateRegister(e.getBase());
|
||||
}
|
||||
}
|
||||
/// For easier macro-ing
/// Overload for immediate arguments: there is no register to check, so this is a no-op.
/// It lets callers invoke validateRegister on every argument without caring about its kind.
void validateRegister(int imm)
|
||||
{
|
||||
}
|
||||
|
||||
void require64()
|
||||
{
|
||||
if (!is64)
|
||||
throw Error(Error::ERR_64_INSTR_IN_32);
|
||||
}
|
||||
void requireAVX()
|
||||
{
|
||||
if (!hasAVX)
|
||||
throw Error(Error::ERR_AVX_INSTR_IN_SSE);
|
||||
}
|
||||
|
||||
public:
|
||||
Xbyak::CodeGenerator& actual;
|
||||
|
||||
#if defined(_M_X86_64)
|
||||
constexpr static bool is32 = false;
|
||||
constexpr static bool is64 = true;
|
||||
using AddressReg = Xbyak::Reg64;
|
||||
using RipType = Xbyak::RegRip;
|
||||
|
||||
template <typename T32, typename T64>
|
||||
struct Choose3264 { using type = T64; };
|
||||
|
||||
template <typename T32, typename T64>
|
||||
static T64 choose3264(T32 t32, T64 t64) { return t64; }
|
||||
#else
|
||||
constexpr static bool is32 = true;
|
||||
constexpr static bool is64 = false;
|
||||
using AddressReg = Xbyak::Reg32;
|
||||
using RipType = int;
|
||||
|
||||
template <typename T32, typename T64>
|
||||
struct Choose3264 { using type = T32; };
|
||||
|
||||
template <typename T32, typename T64>
|
||||
static T32 choose3264(T32 t32, T64 t64) { return t32; }
|
||||
#endif
|
||||
|
||||
const bool hasAVX, hasAVX2, hasFMA;
|
||||
|
||||
const Xmm xmm0{0}, xmm1{1}, xmm2{2}, xmm3{3}, xmm4{4}, xmm5{5}, xmm6{6}, xmm7{7}, xmm8{8}, xmm9{9}, xmm10{10}, xmm11{11}, xmm12{12}, xmm13{13}, xmm14{14}, xmm15{15};
|
||||
const Ymm ymm0{0}, ymm1{1}, ymm2{2}, ymm3{3}, ymm4{4}, ymm5{5}, ymm6{6}, ymm7{7}, ymm8{8}, ymm9{9}, ymm10{10}, ymm11{11}, ymm12{12}, ymm13{13}, ymm14{14}, ymm15{15};
|
||||
const AddressReg rax{0}, rcx{1}, rdx{2}, rbx{3}, rsp{4}, rbp{5}, rsi{6}, rdi{7}, r8{8}, r9{9}, r10{10}, r11{11}, r12{12}, r13{13}, r14{14}, r15{15};
|
||||
const Reg32 eax{0}, ecx{1}, edx{2}, ebx{3}, esp{4}, ebp{5}, esi{6}, edi{7}, r8d{8}, r9d{9}, r10d{10}, r11d{11}, r12d{12}, r13d{13}, r14d{14}, r15d{15};
|
||||
const Reg16 ax{0}, cx{1}, dx{2}, bx{3}, sp{4}, bp{5}, si{6}, di{7};
|
||||
const Reg8 al{0}, cl{1}, dl{2}, bl{3}, ah{4}, ch{5}, dh{6}, bh{7};
|
||||
|
||||
const RipType rip{};
|
||||
const Xbyak::AddressFrame ptr{0}, byte{8}, word{16}, dword{32}, qword{64}, xword{128}, yword{256}, zword{512};
|
||||
|
||||
/// Wraps an existing Xbyak generator and records which instruction sets may be emitted.
/// `actual` is dereferenced and kept by reference, so it must be non-null.
/// AVX/AVX2 availability is derived from cpu.sseVersion, FMA from cpu.hasFMA.
GSNewCodeGenerator(Xbyak::CodeGenerator* actual, CPUInfo cpu)
	: actual(*actual),
	  hasAVX(cpu.sseVersion >= SSEVersion::AVX),
	  hasAVX2(cpu.sseVersion >= SSEVersion::AVX2),
	  hasFMA(cpu.hasFMA)
{
}
|
||||
|
||||
|
||||
// ------------ Forwarding instructions ------------
|
||||
// Note: Only instructions used by codegen were added here, so if you're modifying codegen, you may need to add instructions here
|
||||
|
||||
// For instructions available in SSE and AVX, functions with the SSE name and arguments that forward to SSE or AVX depending on the target, as well as functions with the AVX name and arguments that forward to the AVX version or assert on SSE
|
||||
|
||||
// ARGS_* macros are provided for shorter argument lists. The following single-letter abbreviations are used: X=Xmm, Y=Ymm, O=Operand, A=Address, I=Immediate
|
||||
// FORWARD(argcount, category, instrname, argtypes...) forwards an instruction. The following categories are available:
|
||||
// BASE: non-SSE
|
||||
// SSE: available on SSE and v-prefixed on AVX
|
||||
// SSEONLY: available only on SSE (exception on AVX)
|
||||
// AVX: available only on AVX (exception on SSE)
|
||||
// AVX2: available only on AVX2 (exception on AVX/SSE)
|
||||
// FMA: available only with FMA
|
||||
// SFORWARD forwards an SSE-AVX pair where the AVX variant takes the same number of registers (e.g. pshufd dst, src + vpshufd dst, src)
|
||||
// AFORWARD forwards an SSE-AVX pair where the AVX variant takes an extra destination register (e.g. shufps dst, src + vshufps dst, src, src)
|
||||
|
||||
// Implementation details:
|
||||
// ACTUAL_FORWARD_*: Actually forward the function of the given type
|
||||
// FORWARD#: First validates the arguments (e.g. make sure you're not passing registers over 7 on x86), then forwards to an ACTUAL_FORWARD_*
|
||||
|
||||
// Big thanks to https://stackoverflow.com/a/24028231 for helping me figure out how to work around MSVC's terrible macro expander
|
||||
// Of course GCC/Clang don't like the workaround so enjoy the ifdefs
|
||||
#define EXPAND_ARGS(macro, args) macro args
|
||||
|
||||
#define ACTUAL_FORWARD_BASE(name, ...) \
|
||||
actual.name(__VA_ARGS__);
|
||||
|
||||
#define ACTUAL_FORWARD_SSE(name, ...) \
|
||||
if (hasAVX) \
|
||||
actual.v##name(__VA_ARGS__); \
|
||||
else \
|
||||
actual.name(__VA_ARGS__);
|
||||
|
||||
#define ACTUAL_FORWARD_SSEONLY(name, ...) \
|
||||
if (hasAVX) \
|
||||
throw Error(Error::ERR_SSE_INSTR_IN_AVX); \
|
||||
else \
|
||||
actual.name(__VA_ARGS__);
|
||||
|
||||
#define ACTUAL_FORWARD_AVX(name, ...) \
|
||||
if (hasAVX) \
|
||||
actual.name(__VA_ARGS__); \
|
||||
else \
|
||||
throw Error(Error::ERR_AVX_INSTR_IN_SSE);
|
||||
|
||||
#define ACTUAL_FORWARD_AVX2(name, ...) \
|
||||
if (hasAVX2) \
|
||||
actual.name(__VA_ARGS__); \
|
||||
else \
|
||||
throw Error(Error::ERR_AVX_INSTR_IN_SSE);
|
||||
|
||||
#define ACTUAL_FORWARD_FMA(name, ...) \
|
||||
if (hasFMA) \
|
||||
actual.name(__VA_ARGS__); \
|
||||
else \
|
||||
throw Error(Error::ERR_AVX_INSTR_IN_SSE);
|
||||
|
||||
#define FORWARD1(category, name, type) \
|
||||
void name(type a) \
|
||||
{ \
|
||||
validateRegister(a); \
|
||||
ACTUAL_FORWARD_##category(name, a) \
|
||||
}
|
||||
|
||||
#define FORWARD2(category, name, type1, type2) \
|
||||
void name(type1 a, type2 b) \
|
||||
{ \
|
||||
validateRegister(a); \
|
||||
validateRegister(b); \
|
||||
ACTUAL_FORWARD_##category(name, a, b) \
|
||||
}
|
||||
|
||||
#define FORWARD3(category, name, type1, type2, type3) \
|
||||
void name(type1 a, type2 b, type3 c) \
|
||||
{ \
|
||||
validateRegister(a); \
|
||||
validateRegister(b); \
|
||||
validateRegister(c); \
|
||||
ACTUAL_FORWARD_##category(name, a, b, c) \
|
||||
}
|
||||
|
||||
#define FORWARD4(category, name, type1, type2, type3, type4) \
|
||||
void name(type1 a, type2 b, type3 c, type4 d) \
|
||||
{ \
|
||||
validateRegister(a); \
|
||||
validateRegister(b); \
|
||||
validateRegister(c); \
|
||||
validateRegister(d); \
|
||||
ACTUAL_FORWARD_##category(name, a, b, c, d) \
|
||||
}
|
||||
|
||||
#ifdef __GNUC__
|
||||
#define FORWARD_(argcount, ...) FORWARD##argcount(__VA_ARGS__)
|
||||
// Gets the macro evaluator to evaluate in the right order
|
||||
#define FORWARD(...) FORWARD_(__VA_ARGS__)
|
||||
#else
|
||||
#define FORWARD_(argcount, ...) EXPAND_ARGS(FORWARD##argcount, (__VA_ARGS__))
|
||||
// Gets the macro evaluator to evaluate in the right order
|
||||
#define FORWARD(...) EXPAND_ARGS(FORWARD_, (__VA_ARGS__))
|
||||
#endif
|
||||
|
||||
#define FORWARD_SSE_XMM0(name) \
|
||||
void name(const Xmm& a, const Operand& b) \
|
||||
{ \
|
||||
validateRegister(a); \
|
||||
validateRegister(b); \
|
||||
if (hasAVX) \
|
||||
actual.v##name(a, b, Xmm(0)); \
|
||||
else \
|
||||
actual.name(a, b); \
|
||||
} \
|
||||
FORWARD(4, AVX, v##name, const Xmm&, const Xmm&, const Operand&, const Xmm&)
|
||||
|
||||
#define FORWARD_JUMP(name) \
|
||||
void name(const void *addr) { actual.name(addr); } \
|
||||
void name(const Label& label, Xbyak::CodeGenerator::LabelType type = Xbyak::CodeGenerator::T_AUTO) { actual.name(label, type); } \
|
||||
void name(const char *label, Xbyak::CodeGenerator::LabelType type = Xbyak::CodeGenerator::T_AUTO) { actual.name(label, type); }
|
||||
|
||||
#define ADD_ONE_2 3
|
||||
#define ADD_ONE_3 4
|
||||
|
||||
#ifdef __GNUC__
|
||||
#define SFORWARD(argcount, name, ...) FORWARD(argcount, SSE, name, __VA_ARGS__)
|
||||
#define AFORWARD_(argcount, name, arg1, ...) \
|
||||
SFORWARD(argcount, name, arg1, __VA_ARGS__) \
|
||||
FORWARD(ADD_ONE_##argcount, AVX, v##name, arg1, arg1, __VA_ARGS__)
|
||||
// Gets the macro evaluator to evaluate in the right order
|
||||
#define AFORWARD(...) EXPAND_ARGS(AFORWARD_, (__VA_ARGS__))
|
||||
#else
|
||||
#define SFORWARD(argcount, name, ...) EXPAND_ARGS(FORWARD, (argcount, SSE, name, __VA_ARGS__))
|
||||
#define AFORWARD_(argcount, name, arg1, ...) \
|
||||
EXPAND_ARGS(SFORWARD, (argcount, name, arg1, __VA_ARGS__)) \
|
||||
EXPAND_ARGS(FORWARD, (ADD_ONE_##argcount, AVX, v##name, arg1, arg1, __VA_ARGS__))
|
||||
// Gets the macro evaluator to evaluate in the right order
|
||||
#define AFORWARD(...) EXPAND_ARGS(AFORWARD_, (__VA_ARGS__))
|
||||
#endif
|
||||
|
||||
#define FORWARD_OO_OI(name) \
|
||||
FORWARD(2, BASE, name, ARGS_OO) \
|
||||
FORWARD(2, BASE, name, ARGS_OI)
|
||||
|
||||
#define ARGS_OI const Operand&, uint32
|
||||
#define ARGS_OO const Operand&, const Operand&
|
||||
#define ARGS_XI const Xmm&, int
|
||||
#define ARGS_XO const Xmm&, const Operand&
|
||||
#define ARGS_XOI const Xmm&, const Operand&, uint8
|
||||
#define ARGS_XXO const Xmm&, const Xmm&, const Operand&
|
||||
|
||||
// For instructions that are ifdef'd out without XBYAK64
|
||||
#ifdef XBYAK64
|
||||
#define REQUIRE64(action) require64(); action
|
||||
#else
|
||||
#define REQUIRE64(action) require64()
|
||||
#endif
|
||||
|
||||
const uint8 *getCurr() { return actual.getCurr(); }
|
||||
void align(int x = 16) { return actual.align(x); }
|
||||
void db(int code) { actual.db(code); }
|
||||
void L(const std::string& label) { actual.L(label); }
|
||||
|
||||
void cdqe() { REQUIRE64(actual.cdqe()); }
|
||||
void ret(int imm = 0) { actual.ret(imm); }
|
||||
void vzeroupper() { requireAVX(); actual.vzeroupper(); }
|
||||
void vzeroall() { requireAVX(); actual.vzeroall(); }
|
||||
|
||||
FORWARD_OO_OI(add)
|
||||
FORWARD_OO_OI(and)
|
||||
FORWARD_OO_OI(cmp)
|
||||
FORWARD_OO_OI(or)
|
||||
FORWARD_OO_OI(sub)
|
||||
FORWARD_OO_OI(xor)
|
||||
FORWARD(2, BASE, lea, const Reg&, const Address&)
|
||||
FORWARD(2, BASE, mov, const Operand&, size_t)
|
||||
FORWARD(2, BASE, mov, ARGS_OO)
|
||||
FORWARD(2, BASE, movzx, const Reg&, const Operand&)
|
||||
FORWARD(1, BASE, not, const Operand&)
|
||||
FORWARD(1, BASE, pop, const Operand&)
|
||||
FORWARD(1, BASE, push, const Operand&)
|
||||
FORWARD(2, BASE, sar, const Operand&, const Reg8&)
|
||||
FORWARD(2, BASE, sar, ARGS_OI)
|
||||
FORWARD(2, BASE, shl, const Operand&, const Reg8&)
|
||||
FORWARD(2, BASE, shl, ARGS_OI)
|
||||
FORWARD(2, BASE, shr, const Operand&, const Reg8&)
|
||||
FORWARD(2, BASE, shr, ARGS_OI)
|
||||
FORWARD(2, BASE, test, const Operand&, const Reg&);
|
||||
FORWARD(2, BASE, test, ARGS_OI);
|
||||
|
||||
FORWARD_JUMP(je)
|
||||
FORWARD_JUMP(jle)
|
||||
FORWARD_JUMP(jmp)
|
||||
|
||||
AFORWARD(2, addps, ARGS_XO)
|
||||
SFORWARD(2, cvtdq2ps, ARGS_XO)
|
||||
SFORWARD(2, cvtps2dq, ARGS_XO)
|
||||
SFORWARD(2, cvttps2dq, ARGS_XO)
|
||||
SFORWARD(3, extractps, const Operand&, const Xmm&, uint8)
|
||||
AFORWARD(2, maxps, ARGS_XO)
|
||||
AFORWARD(2, minps, ARGS_XO)
|
||||
SFORWARD(2, movaps, ARGS_XO)
|
||||
SFORWARD(2, movaps, const Address&, const Xmm&)
|
||||
SFORWARD(2, movd, const Address&, const Xmm&)
|
||||
SFORWARD(2, movd, const Reg32&, const Xmm&)
|
||||
SFORWARD(2, movd, const Xmm&, const Address&)
|
||||
SFORWARD(2, movd, const Xmm&, const Reg32&)
|
||||
SFORWARD(2, movdqa, ARGS_XO)
|
||||
SFORWARD(2, movdqa, const Address&, const Xmm&)
|
||||
SFORWARD(2, movhps, ARGS_XO)
|
||||
SFORWARD(2, movhps, const Address&, const Xmm&)
|
||||
SFORWARD(2, movq, const Address&, const Xmm&)
|
||||
SFORWARD(2, movq, const Xmm&, const Address&)
|
||||
AFORWARD(2, mulps, ARGS_XO)
|
||||
AFORWARD(2, orps, ARGS_XO)
|
||||
AFORWARD(2, packssdw, ARGS_XO)
|
||||
AFORWARD(2, packusdw, ARGS_XO)
|
||||
AFORWARD(2, packuswb, ARGS_XO)
|
||||
AFORWARD(2, paddd, ARGS_XO)
|
||||
AFORWARD(2, paddusb, ARGS_XO)
|
||||
AFORWARD(2, paddw, ARGS_XO)
|
||||
AFORWARD(2, pand, ARGS_XO)
|
||||
AFORWARD(2, pandn, ARGS_XO)
|
||||
AFORWARD(3, pblendw, ARGS_XOI)
|
||||
AFORWARD(2, pcmpeqd, ARGS_XO)
|
||||
AFORWARD(2, pcmpeqw, ARGS_XO)
|
||||
AFORWARD(2, pcmpgtd, ARGS_XO)
|
||||
SFORWARD(3, pextrd, const Operand&, const Xmm&, uint8)
|
||||
SFORWARD(3, pextrw, const Operand&, const Xmm&, uint8)
|
||||
AFORWARD(3, pinsrd, ARGS_XOI)
|
||||
AFORWARD(2, pmaxsw, ARGS_XO)
|
||||
AFORWARD(2, pminsd, ARGS_XO)
|
||||
AFORWARD(2, pminsw, ARGS_XO)
|
||||
SFORWARD(2, pmovsxbd, ARGS_XO)
|
||||
SFORWARD(2, pmovmskb, const Reg32e&, const Xmm&)
|
||||
SFORWARD(2, pmovzxbw, ARGS_XO)
|
||||
AFORWARD(2, pmulhrsw, ARGS_XO)
|
||||
AFORWARD(2, pmulhw, ARGS_XO)
|
||||
AFORWARD(2, pmullw, ARGS_XO)
|
||||
AFORWARD(2, por, ARGS_XO)
|
||||
SFORWARD(3, pshufd, ARGS_XOI)
|
||||
SFORWARD(3, pshufhw, ARGS_XOI)
|
||||
SFORWARD(3, pshuflw, ARGS_XOI)
|
||||
AFORWARD(2, pslld, ARGS_XI)
|
||||
AFORWARD(2, psllw, ARGS_XI)
|
||||
AFORWARD(2, psrad, ARGS_XI)
|
||||
AFORWARD(2, psrad, ARGS_XO)
|
||||
AFORWARD(2, psraw, ARGS_XI)
|
||||
AFORWARD(2, psrld, ARGS_XI)
|
||||
AFORWARD(2, psrldq, ARGS_XI)
|
||||
AFORWARD(2, psrlw, ARGS_XI)
|
||||
AFORWARD(2, psrlw, ARGS_XO)
|
||||
AFORWARD(2, psubd, ARGS_XO)
|
||||
AFORWARD(2, psubw, ARGS_XO)
|
||||
AFORWARD(2, punpckhdq, ARGS_XO)
|
||||
AFORWARD(2, punpckhwd, ARGS_XO)
|
||||
AFORWARD(2, punpcklbw, ARGS_XO)
|
||||
AFORWARD(2, punpckldq, ARGS_XO)
|
||||
AFORWARD(2, punpcklqdq,ARGS_XO)
|
||||
AFORWARD(2, punpcklwd, ARGS_XO)
|
||||
AFORWARD(2, pxor, ARGS_XO)
|
||||
SFORWARD(2, rcpps, ARGS_XO)
|
||||
AFORWARD(3, shufps, ARGS_XOI)
|
||||
AFORWARD(2, subps, ARGS_XO)
|
||||
AFORWARD(2, xorps, ARGS_XO)
|
||||
|
||||
FORWARD_SSE_XMM0(pblendvb)
|
||||
|
||||
FORWARD(2, AVX, vbroadcastss, ARGS_XO)
|
||||
FORWARD(2, AVX2, vbroadcasti128, const Ymm&, const Address&)
|
||||
FORWARD(2, AVX, vbroadcastf128, const Ymm&, const Address&)
|
||||
FORWARD(3, FMA, vfmadd213ps, ARGS_XXO)
|
||||
FORWARD(3, AVX2, vextracti128, const Operand&, const Ymm&, uint8)
|
||||
FORWARD(4, AVX2, vinserti128, const Ymm&, const Ymm&, const Operand&, uint8);
|
||||
FORWARD(2, AVX2, vpbroadcastd, ARGS_XO)
|
||||
FORWARD(2, AVX2, vpbroadcastq, ARGS_XO)
|
||||
FORWARD(2, AVX2, vpbroadcastw, ARGS_XO)
|
||||
FORWARD(3, AVX2, vpermq, const Ymm&, const Operand&, uint8)
|
||||
FORWARD(3, AVX2, vpgatherdd, const Xmm&, const Address&, const Xmm&);
|
||||
FORWARD(3, AVX2, vpsravd, ARGS_XXO)
|
||||
FORWARD(3, AVX2, vpsrlvd, ARGS_XXO)
|
||||
|
||||
#undef REQUIRE64
|
||||
#undef ARGS_OI
|
||||
#undef ARGS_OO
|
||||
#undef ARGS_XI
|
||||
#undef ARGS_XO
|
||||
#undef ARGS_XOI
|
||||
#undef ARGS_XXO
|
||||
#undef FORWARD_OO_OI
|
||||
#undef AFORWARD
|
||||
#undef AFORWARD_
|
||||
#undef SFORWARD
|
||||
#undef ADD_ONE_2
|
||||
#undef ADD_ONE_3
|
||||
#undef FORWARD_SSE_XMM0
|
||||
#undef FORWARD_JUMP
|
||||
#undef FORWARD
|
||||
#undef FORWARD_
|
||||
#undef FORWARD4
|
||||
#undef FORWARD3
|
||||
#undef FORWARD2
|
||||
#undef FORWARD1
|
||||
#undef ACTUAL_FORWARD_FMA
|
||||
#undef ACTUAL_FORWARD_AVX2
|
||||
#undef ACTUAL_FORWARD_AVX
|
||||
#undef ACTUAL_FORWARD_SSE
|
||||
#undef ACTUAL_FORWARD_SSEONLY
|
||||
#undef ACTUAL_FORWARD_BASE
|
||||
#undef EXPAND_ARGS
|
||||
};
|
||||
566
pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
Normal file
566
pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
Normal file
@@ -0,0 +1,566 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GS/GS_types.h"
|
||||
#include "GSSetupPrimCodeGenerator.all.h"
|
||||
#include "GSVertexSW.h"
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
#define _rip_local(field) ((is32 || m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)])
|
||||
|
||||
#define _64_m_local _64_t0
|
||||
|
||||
/// On AVX, does a v-prefixed separate destination operation
|
||||
/// On SSE, moves src1 into dst using movdqa, then does the operation
|
||||
#define THREEARG(operation, dst, src1, ...) \
|
||||
do \
|
||||
{ \
|
||||
if (hasAVX) \
|
||||
{ \
|
||||
v##operation(dst, src1, __VA_ARGS__); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
movdqa(dst, src1); \
|
||||
operation(dst, __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#if _M_SSE >= 0x501
|
||||
#define _rip_local_d(x) _rip_local(d8.x)
|
||||
#define _rip_local_d_p(x) _rip_local_d(p.x)
|
||||
#else
|
||||
#define _rip_local_d(x) _rip_local(d4.x)
|
||||
#define _rip_local_d_p(x) _rip_local_d(x)
|
||||
#endif
|
||||
|
||||
/// Prepares a setup-prim generator that emits through `base`.
/// `param` is reinterpreted as the GSScanlineLocalData that the generated code stores into,
/// and `key` is the scanline selector key that decides which parts get emitted.
GSSetupPrimCodeGenerator2::GSSetupPrimCodeGenerator2(Xbyak::CodeGenerator* base, CPUInfo cpu, void* param, uint64 key)
	: _parent(base, cpu)
	, m_local(*(GSScanlineLocalData*)param)
	, m_rip(false)
	, many_regs(false)
	// Argument registers are short-lived on x86 but stay live on x64,
	// which is why some of the x86 choices below share a register
#ifdef _WIN32
	, _64_vertex(is64 ? rcx : r8)
	, _index(is64 ? rdx : rcx)
	, _dscan(is64 ? r8 : rdx)
	, _64_t0(r9)
	, t1(is64 ? r10 : rcx)
#else
	, _64_vertex(is64 ? rdi : r8)
	, _index(is64 ? rsi : rcx)
	, _dscan(rdx)
	, _64_t0(is64 ? rcx : r8)
	, t1(is64 ? r8 : rcx)
#endif
	, _m_local(chooseLocal(&m_local, _64_m_local))
{
	m_sel.key = key;

	// Everything except depth requires a frame buffer
	const bool has_fb = m_sel.fb;

	m_en.z = (m_sel.zb != 0);
	m_en.f = (has_fb && m_sel.fge);
	m_en.t = (has_fb && m_sel.tfx != TFX_NONE);
	m_en.c = (has_fb && (m_sel.tfx != TFX_DECAL || !m_sel.tcc));
}
|
||||
|
||||
void GSSetupPrimCodeGenerator2::broadcastf128(const XYm& reg, const Address& mem)
|
||||
{
|
||||
#if SETUP_PRIM_USING_YMM
|
||||
vbroadcastf128(reg, mem);
|
||||
#else
|
||||
movaps(reg, mem);
|
||||
#endif
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator2::Generate()
|
||||
{
|
||||
// Technically we just need the delta < 2GB
|
||||
m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000;
|
||||
|
||||
bool needs_shift = (m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip;
|
||||
many_regs = is64 && isYmm && !m_sel.notest && needs_shift;
|
||||
|
||||
#ifdef _WIN64
|
||||
int needs_saving = many_regs ? 6 : m_sel.notest ? 0 : 2;
|
||||
if (needs_saving)
|
||||
{
|
||||
sub(rsp, 8 + 16 * needs_saving);
|
||||
for (int i = 0; i < needs_saving; i++)
|
||||
{
|
||||
movdqa(ptr[rsp + i * 16], Xmm(i + 6));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (is64 && !m_rip)
|
||||
mov(_64_m_local, (size_t)&m_local);
|
||||
|
||||
if (needs_shift)
|
||||
{
|
||||
if (is32)
|
||||
mov(_dscan, ptr[rsp + _32_dscan]);
|
||||
|
||||
if (isXmm)
|
||||
mov(rax, (size_t)g_const->m_shift_128b);
|
||||
else
|
||||
mov(rax, (size_t)g_const->m_shift_256b);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : many_regs ? 9 : 5); i++)
|
||||
{
|
||||
movaps(XYm(3 + i), ptr[rax + i * vecsize]);
|
||||
}
|
||||
}
|
||||
|
||||
if (isXmm)
|
||||
Depth_XMM();
|
||||
else
|
||||
Depth_YMM();
|
||||
|
||||
Texture();
|
||||
|
||||
Color();
|
||||
|
||||
#ifdef _WIN64
|
||||
if (needs_saving)
|
||||
{
|
||||
for (int i = 0; i < needs_saving; i++)
|
||||
{
|
||||
movdqa(Xmm(i + 6), ptr[rsp + i * 16]);
|
||||
}
|
||||
add(rsp, 8 + 16 * needs_saving);
|
||||
}
|
||||
#endif
|
||||
if (isYmm)
|
||||
vzeroupper();
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator2::Depth_XMM()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
|
||||
movaps(xmm0, ptr[_dscan + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
THREEARG(shufps, xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
THREEARG(mulps, xmm2, xmm1, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(_rip_local_d_p(f), xmm2);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
THREEARG(mulps, xmm2, xmm1, XYm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(_rip_local(d[i].f), xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
THREEARG(mulps, xmm1, xmm0, xmm3);
|
||||
movdqa(_rip_local_d_p(z), xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
THREEARG(mulps, xmm1, xmm0, XYm(4 + i));
|
||||
movdqa(_rip_local(d[i].z), xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
if (is32)
|
||||
mov(_index, ptr[rsp + _32_index]);
|
||||
mov(eax, ptr[_index + sizeof(uint32) * 1]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
if (is64)
|
||||
add(rax, _64_vertex);
|
||||
else
|
||||
add(rax, ptr[rsp + _32_vertex]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
movaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
|
||||
|
||||
cvttps2dq(xmm1, xmm0);
|
||||
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
movdqa(_rip_local(p.f), xmm1);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// uint32 z is bypassed in t.w
|
||||
|
||||
movdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
|
||||
pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
movdqa(_rip_local(p.z), xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator2::Depth_YMM()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
|
||||
|
||||
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, p)]);
|
||||
|
||||
vmulps(ymm1, ymm0, ymm3);
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.d8.p.z = dp8.extract32<2>();
|
||||
|
||||
extractps(_rip_local_d_p(z), xmm1, 2);
|
||||
|
||||
// GSVector8 dz = GSVector8(dscan.p).zzzz();
|
||||
|
||||
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
|
||||
|
||||
cvtps2dq(ymm1, ymm1);
|
||||
pextrd(_rip_local_d_p(f), xmm1, 3);
|
||||
|
||||
// GSVector8 df = GSVector8(dscan.p).wwww();
|
||||
|
||||
vshufps(ymm1, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
|
||||
{
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.d[i].z = dz * shift[1 + i];
|
||||
|
||||
// Save a byte in the encoding for ymm8-11 by swapping with ymm2 (multiplication is communative)
|
||||
if (i < 4 || many_regs)
|
||||
vmulps(ymm0, Ymm(4 + i), ymm2);
|
||||
else
|
||||
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
movaps(_rip_local(d[i].z), ymm0);
|
||||
}
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
if (i < 4 || many_regs)
|
||||
vmulps(ymm0, Ymm(4 + i), ymm1);
|
||||
else
|
||||
vmulps(ymm0, ymm1, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
cvttps2dq(ymm0, ymm0);
|
||||
pshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(_rip_local(d[i].f), ymm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
if (is32)
|
||||
mov(_index, ptr[rsp + _32_index]);
|
||||
mov(eax, ptr[_index + sizeof(uint32) * 1]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
if (is64)
|
||||
add(rax, _64_vertex);
|
||||
else
|
||||
add(rax, ptr[rsp + _32_vertex]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();
|
||||
|
||||
movaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
|
||||
cvttps2dq(xmm0, xmm0);
|
||||
pextrd(_rip_local(p.f), xmm0, 3);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
|
||||
|
||||
mov(t1.cvt32(), ptr[rax + offsetof(GSVertexSW, t.w)]);
|
||||
mov(_rip_local(p.z), t1.cvt32());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator2::Texture()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, t)]);
|
||||
|
||||
THREEARG(mulps, xmm1, xmm0, xmm3);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d4.stq = GSVector4i(t * 4.0f);
|
||||
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
|
||||
movdqa(_rip_local_d(stq), xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
movaps(_rip_local_d(stq), xmm1);
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
THREEARG(shufps, xym1, xym0, xym0, _MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
if (i < 4 || many_regs)
|
||||
THREEARG(mulps, xym2, XYm(4 + i), xym1);
|
||||
else
|
||||
vmulps(ymm2, ymm1, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector4i(v);
|
||||
|
||||
cvttps2dq(xym2, xym2);
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: movdqa(_rip_local(d[i].s), xym2); break;
|
||||
case 1: movdqa(_rip_local(d[i].t), xym2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: movaps(_rip_local(d[i].s), xym2); break;
|
||||
case 1: movaps(_rip_local(d[i].t), xym2); break;
|
||||
case 2: movaps(_rip_local(d[i].q), xym2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator2::Color()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
THREEARG(mulps, xmm1, xmm0, xmm3);
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
packssdw(xmm1, xmm1);
|
||||
if (isXmm)
|
||||
movdqa(_rip_local_d(c), xmm1);
|
||||
else
|
||||
movq(_rip_local_d(c), xmm1);
|
||||
|
||||
// xym3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
THREEARG(shufps, xym2, xym0, xym0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
THREEARG(shufps, xym3, xym0, xym0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
if (i < 4 || many_regs)
|
||||
THREEARG(mulps, xym0, XYm(4 + i), xym2);
|
||||
else
|
||||
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
cvttps2dq(xym0, xym0);
|
||||
packssdw(xym0, xym0);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
if (i < 4 || many_regs)
|
||||
THREEARG(mulps, xym1, XYm(4 + i), xym3);
|
||||
else
|
||||
vmulps(ymm1, ymm3, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
cvttps2dq(xym1, xym1);
|
||||
packssdw(xym1, xym1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
punpcklwd(xym0, xym1);
|
||||
movdqa(_rip_local(d[i].rb), xym0);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
broadcastf128(xym0, ptr[_dscan + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
THREEARG(shufps, xym2, xym0, xym0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
THREEARG(shufps, xym3, xym0, xym0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : dsize); i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
if (i < 4 || many_regs)
|
||||
THREEARG(mulps, xym0, XYm(4 + i), xym2);
|
||||
else
|
||||
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
cvttps2dq(xym0, xym0);
|
||||
packssdw(xym0, xym0);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
if (i < 4 || many_regs)
|
||||
THREEARG(mulps, xym1, XYm(4 + i), xym3);
|
||||
else
|
||||
vmulps(ymm1, ymm3, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
cvttps2dq(xym1, xym1);
|
||||
packssdw(xym1, xym1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
punpcklwd(xym0, xym1);
|
||||
movdqa(_rip_local(d[i].ga), xym0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
if (is32)
|
||||
mov(_index, ptr[rsp + _32_index]);
|
||||
mov(eax, ptr[_index + sizeof(uint32) * last]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
if (is64)
|
||||
add(rax, _64_vertex);
|
||||
else
|
||||
add(rax, ptr[rsp + _32_vertex]);
|
||||
}
|
||||
|
||||
if (isXmm)
|
||||
{
|
||||
cvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]);
|
||||
}
|
||||
else
|
||||
{
|
||||
vbroadcasti128(ymm0, ptr[rax + offsetof(GSVertexSW, c)]);
|
||||
cvttps2dq(ymm0, ymm0);
|
||||
}
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
pshufd(xym1, xym0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
punpcklwd(xym0, xym1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
psrlw(xym0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
pshufd(xym1, xym0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
pshufd(xym2, xym0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
movdqa(_rip_local(c.rb), xym1);
|
||||
movdqa(_rip_local(c.ga), xym2);
|
||||
}
|
||||
}
|
||||
83
pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h
Normal file
83
pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h
Normal file
@@ -0,0 +1,83 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "GSScanlineEnvironment.h"
|
||||
#include "GSNewCodeGenerator.h"
|
||||
|
||||
// Select the vector register class used by GSSetupPrimCodeGenerator2 at build time:
// Ymm when _M_SSE >= 0x501, Xmm otherwise. The two USING_* flags mirror that choice as 0/1.
#if _M_SSE >= 0x501
|
||||
#define SETUP_PRIM_VECTOR_REGISTER Xbyak::Ymm
|
||||
#define SETUP_PRIM_USING_XMM 0
|
||||
#define SETUP_PRIM_USING_YMM 1
|
||||
#else
|
||||
#define SETUP_PRIM_VECTOR_REGISTER Xbyak::Xmm
|
||||
#define SETUP_PRIM_USING_XMM 1
|
||||
#define SETUP_PRIM_USING_YMM 0
|
||||
#endif
|
||||
|
||||
class GSSetupPrimCodeGenerator2 : public GSNewCodeGenerator
|
||||
{
|
||||
using _parent = GSNewCodeGenerator;
|
||||
using XYm = SETUP_PRIM_VECTOR_REGISTER;
|
||||
|
||||
using Xmm = Xbyak::Xmm;
|
||||
using Ymm = Xbyak::Ymm;
|
||||
|
||||
/// On x86-64 we reserve a bunch of GPRs for holding addresses of locals that would otherwise be hard to reach
|
||||
/// On x86-32 the same values are just raw 32-bit addresses
|
||||
using LocalAddr = Choose3264<size_t, AddressReg>::type;
|
||||
|
||||
constexpr static bool isXmm = std::is_same<XYm, Xbyak::Xmm>::value;
|
||||
constexpr static bool isYmm = std::is_same<XYm, Xbyak::Ymm>::value;
|
||||
constexpr static int vecsize = isXmm ? 16 : 32;
|
||||
|
||||
constexpr static int dsize = isXmm ? 4 : 8;
|
||||
|
||||
constexpr static int _32_args = 0;
|
||||
constexpr static int _invalid = 0xaaaaaaaa;
|
||||
constexpr static int _32_vertex = is64 ? _invalid : _32_args + 4;
|
||||
constexpr static int _32_index = is64 ? _invalid : _32_args + 8;
|
||||
constexpr static int _32_dscan = is64 ? _invalid : _32_args + 12;
|
||||
|
||||
GSScanlineSelector m_sel;
|
||||
GSScanlineLocalData& m_local;
|
||||
bool m_rip;
|
||||
bool many_regs;
|
||||
|
||||
struct {uint32 z:1, f:1, t:1, c:1;} m_en;
|
||||
|
||||
const XYm xym0{0}, xym1{1}, xym2{2}, xym3{3}, xym4{4}, xym5{5}, xym6{6}, xym7{7}, xym8{8}, xym9{9}, xym10{10}, xym11{11}, xym12{12}, xym13{13}, xym14{14}, xym15{15};
|
||||
const AddressReg _64_vertex, _index, _dscan, _64_t0, t1;
|
||||
const LocalAddr _m_local;
|
||||
/// Returns the first arg on 32-bit, second on 64-bit
|
||||
static LocalAddr chooseLocal(const void* addr32, AddressReg reg64)
|
||||
{
|
||||
return choose3264((size_t)addr32, reg64);
|
||||
}
|
||||
|
||||
public:
|
||||
GSSetupPrimCodeGenerator2(Xbyak::CodeGenerator* base, CPUInfo cpu, void* param, uint64 key);
|
||||
void Generate();
|
||||
|
||||
private:
|
||||
/// Broadcast 128 bits of floats from memory to the whole register, whatever size that register might be
|
||||
void broadcastf128(const XYm& reg, const Xbyak::Address& mem);
|
||||
|
||||
void Depth_XMM();
|
||||
void Depth_YMM();
|
||||
void Texture();
|
||||
void Color();
|
||||
};
|
||||
@@ -15,6 +15,7 @@
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSSetupPrimCodeGenerator.all.h"
|
||||
|
||||
using namespace Xbyak;
|
||||
|
||||
@@ -30,19 +31,5 @@ GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void
|
||||
m_en.t = m_sel.fb && m_sel.tfx != TFX_NONE ? 1 : 0;
|
||||
m_en.c = m_sel.fb && !(m_sel.tfx == TFX_DECAL && m_sel.tcc) ? 1 : 0;
|
||||
|
||||
try
|
||||
{
|
||||
#if _M_SSE >= 0x501
|
||||
Generate_AVX2();
|
||||
#else
|
||||
if (m_cpu.has(util::Cpu::tAVX))
|
||||
Generate_AVX();
|
||||
else
|
||||
Generate_SSE();
|
||||
#endif
|
||||
}
|
||||
catch (std::exception& e)
|
||||
{
|
||||
fprintf(stderr, "ERR:GSSetupPrimCodeGenerator %s\n", e.what());
|
||||
}
|
||||
GSSetupPrimCodeGenerator2(this, CPUInfo(m_cpu), param, key).Generate();
|
||||
}
|
||||
|
||||
@@ -32,23 +32,6 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator
|
||||
uint32 z : 1, f : 1, t : 1, c : 1;
|
||||
} m_en;
|
||||
|
||||
#if _M_SSE < 0x501
|
||||
void Generate_SSE();
|
||||
void Depth_SSE();
|
||||
void Texture_SSE();
|
||||
void Color_SSE();
|
||||
|
||||
void Generate_AVX();
|
||||
void Depth_AVX();
|
||||
void Texture_AVX();
|
||||
void Color_AVX();
|
||||
#else
|
||||
void Generate_AVX2();
|
||||
void Depth_AVX2();
|
||||
void Texture_AVX2();
|
||||
void Color_AVX2();
|
||||
#endif
|
||||
|
||||
public:
|
||||
GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);
|
||||
};
|
||||
|
||||
@@ -1,365 +0,0 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
#include "GS/GS_codegen.h"
|
||||
|
||||
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
#define _rip_local(field) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offsetof(GSScanlineLocalData, field)])
|
||||
#define _rip_local_v(field, offset) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offset])
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate_AVX()
|
||||
{
|
||||
// Technically we just need the delta < 2GB
|
||||
m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000;
|
||||
|
||||
#ifdef _WIN64
|
||||
sub(rsp, 8 + 2 * 16);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], xmm6);
|
||||
vmovdqa(ptr[rsp + 16], xmm7);
|
||||
#endif
|
||||
|
||||
if (!m_rip)
|
||||
mov(t0, (size_t)&m_local);
|
||||
|
||||
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(rax, (size_t)g_const->m_shift_128b);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
|
||||
{
|
||||
vmovaps(Xmm(3 + i), ptr[rax + i * 16]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth_AVX();
|
||||
|
||||
Texture_AVX();
|
||||
|
||||
Color_AVX();
|
||||
|
||||
#ifdef _WIN64
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
add(rsp, 8 + 2 * 16);
|
||||
#endif
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth_AVX()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, xmm3);
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(_rip_local(d4.f), xmm2);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovdqa(_rip_local_v(d[i].f, variableOffset), xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vmovdqa(_rip_local(d4.z), xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
vmulps(xmm1, xmm0, Xmm(4 + i));
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovdqa(_rip_local_v(d[i].z, variableOffset), xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
mov(eax, ptr[a1 + sizeof(uint32) * 1]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
add(rax, a0);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
vmovaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
|
||||
|
||||
vcvttps2dq(xmm1, xmm0);
|
||||
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vmovdqa(_rip_local(p.f), xmm1);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// uint32 z is bypassed in t.w
|
||||
|
||||
vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
|
||||
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
vmovdqa(_rip_local(p.z), xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture_AVX()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, t)]);
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d4.stq = GSVector4i(t * 4.0f);
|
||||
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
|
||||
vmovdqa(_rip_local(d4.stq), xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
vmovaps(_rip_local(d4.stq), xmm1);
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector4i(v);
|
||||
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
|
||||
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovdqa(_rip_local_v(d[i].s, variableOffsetS), xmm2); break;
|
||||
case 1: vmovdqa(_rip_local_v(d[i].t, variableOffsetT), xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetQ = offsetof(GSScanlineLocalData, d[0].q) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovaps(_rip_local_v(d[i].s, variableOffsetS), xmm2); break;
|
||||
case 1: vmovaps(_rip_local_v(d[i].t, variableOffsetT), xmm2); break;
|
||||
case 2: vmovaps(_rip_local_v(d[i].q, variableOffsetQ), xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color_AVX()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(xmm1, xmm1);
|
||||
vmovdqa(_rip_local(d4.c), xmm1);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovdqa(_rip_local_v(d[i].rb, variableOffset), xmm0);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovdqa(_rip_local_v(d[i].ga, variableOffset), xmm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
mov(eax, ptr[a1 + sizeof(uint32) * last]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
add(rax, a0);
|
||||
}
|
||||
|
||||
vcvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(_rip_local(c.rb), xmm1);
|
||||
vmovdqa(_rip_local(c.ga), xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,368 +0,0 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
#include "GS/GS_codegen.h"
|
||||
|
||||
#if _M_SSE >= 0x501 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
#define _rip_local(field) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offsetof(GSScanlineLocalData, field)])
|
||||
#define _rip_local_v(field, offset) (m_rip ? ptr[rip + &m_local.field] : ptr[t0 + offset])
|
||||
|
||||
#define _m_shift(i) (Ymm(7 + i))
|
||||
|
||||
// FIXME windows ?
|
||||
#define _vertex rcx
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate_AVX2()
|
||||
{
|
||||
// Technically we just need the delta < 2GB
|
||||
m_rip = (size_t)&m_local < 0x80000000 && (size_t)getCurr() < 0x80000000;
|
||||
|
||||
#ifdef _WIN64
|
||||
sub(rsp, 8 + 2 * 16);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], ymm6);
|
||||
vmovdqa(ptr[rsp + 16], ymm7);
|
||||
#endif
|
||||
|
||||
if (!m_rip)
|
||||
mov(t0, (size_t)&m_local);
|
||||
|
||||
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(rax, (size_t)g_const->m_shift_256b);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : 9); i++)
|
||||
{
|
||||
vmovaps(_m_shift(i), ptr[rax + i * 32]);
|
||||
}
|
||||
}
|
||||
// ymm7 to ymm 15 = m_shift[i]
|
||||
|
||||
Depth_AVX2();
|
||||
|
||||
Texture_AVX2();
|
||||
|
||||
Color_AVX2();
|
||||
|
||||
#ifdef _WIN64
|
||||
vmovdqa(ymm6, ptr[rsp + 0]);
|
||||
vmovdqa(ymm7, ptr[rsp + 16]);
|
||||
|
||||
add(rsp, 8 + 2 * 16);
|
||||
#endif
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth_AVX2()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
const Ymm& dscan_p = ymm6;
|
||||
|
||||
// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
|
||||
|
||||
vbroadcastf128(dscan_p, ptr[a2 + offsetof(GSVertexSW, p)]);
|
||||
|
||||
vmulps(ymm1, dscan_p, _m_shift(0));
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.d8.p.z = dp8.extract32<2>();
|
||||
|
||||
vextractps(_rip_local(d8.p.z), xmm1, 2);
|
||||
|
||||
// GSVector8 dz = GSVector8(dscan.p).zzzz();
|
||||
|
||||
vshufps(ymm2, dscan_p, dscan_p, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * shift[1 + i];
|
||||
|
||||
vmulps(ymm0, ymm2, _m_shift(1 + i));
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovaps(_rip_local_v(d[i].z, variableOffset), ymm0);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
|
||||
|
||||
// FIXME no truncate ? why ? vcvttps2dq ?
|
||||
//vcvtps2dq(ymm2, ymm1); // let's guess a typo
|
||||
vcvttps2dq(ymm2, ymm1);
|
||||
vpextrd(_rip_local(d8.p.f), xmm2, 3);
|
||||
|
||||
// GSVector8 df = GSVector8(dscan.p).wwww();
|
||||
|
||||
vshufps(ymm3, dscan_p, dscan_p, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
vmulps(ymm0, ymm3, _m_shift(1 + i));
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
|
||||
vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovdqa(_rip_local_v(d[i].f, variableOffset), ymm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
mov(_vertex.cvt32(), ptr[a1 + sizeof(uint32) * 1]);
|
||||
shl(_vertex.cvt32(), 6); // * sizeof(GSVertexSW)
|
||||
add(_vertex, a0);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();
|
||||
|
||||
vmovaps(xmm0, ptr[_vertex + offsetof(GSVertexSW, p)]);
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpextrd(_rip_local(p.f), xmm0, 3);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
|
||||
|
||||
mov(eax, ptr[ecx + offsetof(GSVertexSW, t.w)]);
|
||||
mov(_rip_local(p.z), eax);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture_AVX2()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector8 dt(dscan.t);
|
||||
|
||||
vbroadcastf128(ymm0, ptr[a2 + offsetof(GSVertexSW, t)]);
|
||||
|
||||
// GSVector8 dt8 = dt * shift[0];
|
||||
|
||||
vmulps(ymm1, ymm0, _m_shift(0));
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.84.stq = GSVector4i(t * 4.0f);
|
||||
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
|
||||
vmovdqa(_rip_local(d8.stq), xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d8.stq = t * 4.0f;
|
||||
|
||||
vmovaps(_rip_local(d8.stq), xmm1);
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector8 dstq = dt.xxxx/yyyy/zzzz();
|
||||
|
||||
vshufps(ymm1, ymm0, ymm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8 v = dstq * shift[1 + i];
|
||||
|
||||
vmulps(ymm2, ymm1, _m_shift(1 + i));
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector8::cast(GSVector8i(v));
|
||||
|
||||
vcvttps2dq(ymm2, ymm2);
|
||||
|
||||
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovdqa(_rip_local_v(d[i].s, variableOffsetS), ymm2); break;
|
||||
case 1: vmovdqa(_rip_local_v(d[i].t, variableOffsetT), ymm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetQ = offsetof(GSScanlineLocalData, d[0].q) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovaps(_rip_local_v(d[i].s, variableOffsetS), ymm2); break;
|
||||
case 1: vmovaps(_rip_local_v(d[i].t, variableOffsetT), ymm2); break;
|
||||
case 2: vmovaps(_rip_local_v(d[i].q, variableOffsetQ), ymm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color_AVX2()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
const Ymm& dscan_c = ymm6;
|
||||
|
||||
// GSVector8 dc(dscan.c);
|
||||
|
||||
vbroadcastf128(dscan_c, ptr[a2 + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// m_local.d8.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
vmulps(ymm1, dscan_c, ymm3);
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(ymm1, ymm1);
|
||||
vmovq(_rip_local(d8.c), xmm1);
|
||||
|
||||
// GSVector8 dr = dc.xxxx();
|
||||
// GSVector8 db = dc.zzzz();
|
||||
|
||||
vshufps(ymm2, dscan_c, dscan_c, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(ymm3, dscan_c, dscan_c, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32();
|
||||
|
||||
vmulps(ymm0, ymm2, _m_shift(1 + i));
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
vpackssdw(ymm0, ymm0);
|
||||
|
||||
// GSVector4i b = GSVector8i(db * shift[1 + i]).ps32();
|
||||
|
||||
vmulps(ymm1, ymm3, _m_shift(1 + i));
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpackssdw(ymm1, ymm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovdqa(_rip_local_v(d[i].rb, variableOffset), ymm0);
|
||||
}
|
||||
|
||||
// GSVector8 dg = dc.yyyy();
|
||||
// GSVector8 da = dc.wwww();
|
||||
|
||||
vshufps(ymm2, dscan_c, dscan_c, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(ymm3, dscan_c, dscan_c, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32();
|
||||
|
||||
vmulps(ymm0, ymm2, _m_shift(1 + i));
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
vpackssdw(ymm0, ymm0);
|
||||
|
||||
// GSVector8i a = GSVector8i(da * shift[1 + i]).ps32();
|
||||
|
||||
vmulps(ymm1, ymm3, _m_shift(1 + i));
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpackssdw(ymm1, ymm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
vmovdqa(_rip_local_v(d[i].ga, variableOffset), ymm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
mov(_vertex.cvt32(), ptr[a1 + sizeof(uint32) * last]);
|
||||
shl(_vertex.cvt32(), 6); // * sizeof(GSVertexSW)
|
||||
add(_vertex, a0);
|
||||
}
|
||||
|
||||
vbroadcasti128(ymm0, ptr[_vertex + offsetof(GSVertexSW, c)]);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(ymm1, ymm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(ymm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(ymm1, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(ymm2, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(_rip_local(c.rb), ymm1);
|
||||
vmovdqa(_rip_local(c.ga), ymm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,374 +0,0 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
#include "GS/GS_codegen.h"
|
||||
|
||||
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate_SSE()
|
||||
{
|
||||
#ifdef _WIN64
|
||||
sub(rsp, 8 + 2 * 16);
|
||||
|
||||
vmovdqa(ptr[rsp + 0], xmm6);
|
||||
vmovdqa(ptr[rsp + 16], xmm7);
|
||||
#endif
|
||||
|
||||
mov(t0, (size_t)&m_local);
|
||||
|
||||
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(rax, (size_t)g_const->m_shift_128b[0]);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
|
||||
{
|
||||
movaps(Xmm(3 + i), ptr[rax + i * 16]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth_SSE();
|
||||
|
||||
Texture_SSE();
|
||||
|
||||
Color_SSE();
|
||||
|
||||
#ifdef _WIN64
|
||||
vmovdqa(xmm6, ptr[rsp + 0]);
|
||||
vmovdqa(xmm7, ptr[rsp + 16]);
|
||||
|
||||
add(rsp, 8 + 2 * 16);
|
||||
#endif
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth_SSE()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.f)], xmm2);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].f) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
movdqa(ptr[t0 + variableOffset], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.z)], xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, Xmm(4 + i));
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].z) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
movdqa(ptr[t0 + variableOffset], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
mov(eax, ptr[a1 + sizeof(uint32) * 1]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
add(rax, a0);
|
||||
|
||||
movaps(xmm0, ptr[rax + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
cvttps2dq(xmm1, xmm0);
|
||||
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.f)], xmm1);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// uint32 z is bypassed in t.w
|
||||
|
||||
vmovdqa(xmm0, ptr[rax + offsetof(GSVertexSW, t)]);
|
||||
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
vmovdqa(ptr[t0 + offsetof(GSScanlineLocalData, p.z)], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture_SSE()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, t)]);
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d4.stq = GSVector4i(t * 4.0f);
|
||||
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
|
||||
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
movaps(ptr[t0 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector4i(v);
|
||||
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
|
||||
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: movdqa(ptr[t0 + variableOffsetS], xmm2); break;
|
||||
case 1: movdqa(ptr[t0 + variableOffsetT], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
const size_t variableOffsetS = offsetof(GSScanlineLocalData, d[0].s) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetT = offsetof(GSScanlineLocalData, d[0].t) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
const size_t variableOffsetQ = offsetof(GSScanlineLocalData, d[0].q) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: movaps(ptr[t0 + variableOffsetS], xmm2); break;
|
||||
case 1: movaps(ptr[t0 + variableOffsetT], xmm2); break;
|
||||
case 2: movaps(ptr[t0 + variableOffsetQ], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color_SSE()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]);
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
packssdw(xmm2, xmm2);
|
||||
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, d4.c)], xmm2);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].rb) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
movdqa(ptr[t0 + variableOffset], xmm2);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[a2 + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
|
||||
const size_t variableOffset = offsetof(GSScanlineLocalData, d[0].ga) + (i * sizeof(GSScanlineLocalData::d[0]));
|
||||
movdqa(ptr[t0 + variableOffset], xmm2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
mov(eax, ptr[a1 + sizeof(uint32) * last]);
|
||||
shl(eax, 6); // * sizeof(GSVertexSW)
|
||||
add(rax, a0);
|
||||
}
|
||||
|
||||
cvttps2dq(xmm0, ptr[rax + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
punpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
psrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.rb)], xmm1);
|
||||
movdqa(ptr[t0 + offsetof(GSScanlineLocalData, c.ga)], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,335 +0,0 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
#include "GS/GS_codegen.h"
|
||||
|
||||
#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
// Offsets (in bytes, relative to esp) at which the generated code reads its
// vertex, index and dscan arguments.
static constexpr int _args = 0;
static constexpr int _vertex = _args + 4;
static constexpr int _index = _args + 8;
static constexpr int _dscan = _args + 12;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate_AVX()
|
||||
{
|
||||
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(edx, dword[esp + _dscan]);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
|
||||
{
|
||||
vmovaps(Xmm(3 + i), ptr[g_const->m_shift_128b[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth_AVX();
|
||||
|
||||
Texture_AVX();
|
||||
|
||||
Color_AVX();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth_AVX()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, xmm3);
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d4.f], xmm2);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d[i].f], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vmovdqa(ptr[&m_local.d4.z], xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
vmulps(xmm1, xmm0, Xmm(4 + i));
|
||||
vmovdqa(ptr[&m_local.d[i].z], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
mov(ecx, ptr[esp + _index]);
|
||||
mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
|
||||
shl(ecx, 6); // * sizeof(GSVertexSW)
|
||||
add(ecx, ptr[esp + _vertex]);
|
||||
|
||||
vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
vcvttps2dq(xmm1, xmm0);
|
||||
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
vmovdqa(ptr[&m_local.p.f], xmm1);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// uint32 z is bypassed in t.w
|
||||
|
||||
vmovdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]);
|
||||
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
vmovdqa(ptr[&m_local.p.z], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture_AVX()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, t)]);
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d4.stq = GSVector4i(t * 4.0f);
|
||||
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
|
||||
vmovdqa(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
vmovaps(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
vmulps(xmm2, xmm1, Xmm(4 + i));
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector4i(v);
|
||||
|
||||
vcvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovdqa(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: vmovdqa(ptr[&m_local.d[i].t], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovaps(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: vmovaps(ptr[&m_local.d[i].t], xmm2); break;
|
||||
case 2: vmovaps(ptr[&m_local.d[i].q], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color_AVX()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
vmulps(xmm1, xmm0, xmm3);
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(xmm1, xmm1);
|
||||
vmovdqa(ptr[&m_local.d4.c], xmm1);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[&m_local.d[i].rb], xmm0);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
vmovaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm0, xmm2, Xmm(4 + i));
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpackssdw(xmm0, xmm0);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
vmulps(xmm1, xmm3, Xmm(4 + i));
|
||||
vcvttps2dq(xmm1, xmm1);
|
||||
vpackssdw(xmm1, xmm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
vmovdqa(ptr[&m_local.d[i].ga], xmm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
mov(ecx, ptr[esp + _index]);
|
||||
mov(ecx, ptr[ecx + sizeof(uint32) * last]);
|
||||
shl(ecx, 6); // * sizeof(GSVertexSW)
|
||||
add(ecx, ptr[esp + _vertex]);
|
||||
}
|
||||
|
||||
vcvttps2dq(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(ptr[&m_local.c.rb], xmm1);
|
||||
vmovdqa(ptr[&m_local.c.ga], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,360 +0,0 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
#include "GS/GS_codegen.h"
|
||||
|
||||
#if _M_SSE >= 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
// Offsets (in bytes, relative to esp) at which the generated code reads its
// vertex, index and dscan arguments.
static constexpr int _args = 0;
static constexpr int _vertex = _args + 4;
static constexpr int _index = _args + 8;
static constexpr int _dscan = _args + 12;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate_AVX2()
|
||||
{
|
||||
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(edx, dword[esp + _dscan]);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
|
||||
{
|
||||
vmovaps(Ymm(3 + i), ptr[g_const->m_shift_256b[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth_AVX2();
|
||||
|
||||
Texture_AVX2();
|
||||
|
||||
Color_AVX2();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth_AVX2()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 dp8 = dscan.p * GSVector4::broadcast32(&shift[0]);
|
||||
|
||||
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
vmulps(ymm1, ymm0, ymm3);
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.d8.p.z = dp8.extract32<2>();
|
||||
|
||||
vextractps(ptr[&m_local.d8.p.z], xmm1, 2);
|
||||
}
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.d8.p.f = GSVector4i(dp8).extract32<3>();
|
||||
|
||||
vcvtps2dq(ymm2, ymm1);
|
||||
vpextrd(ptr[&m_local.d8.p.f], xmm2, 3);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// GSVector8 dz = GSVector8(dscan.p).zzzz();
|
||||
|
||||
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
}
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// GSVector8 df = GSVector8(dscan.p).wwww();
|
||||
|
||||
vshufps(ymm1, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
}
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.d[i].z = dz * shift[1 + i];
|
||||
|
||||
if (i < 4)
|
||||
vmulps(ymm0, ymm2, Ymm(4 + i));
|
||||
else
|
||||
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
vmovaps(ptr[&m_local.d[i].z], ymm0);
|
||||
}
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
if (i < 4)
|
||||
vmulps(ymm0, ymm1, Ymm(4 + i));
|
||||
else
|
||||
vmulps(ymm0, ymm1, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
vmovdqa(ptr[&m_local.d[i].f], ymm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
mov(ecx, ptr[esp + _index]);
|
||||
mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
|
||||
shl(ecx, 6); // * sizeof(GSVertexSW)
|
||||
add(ecx, ptr[esp + _vertex]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(vertex[index[1]].p).extract32<3>();
|
||||
|
||||
vmovaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
|
||||
vcvttps2dq(xmm0, xmm0);
|
||||
vpextrd(ptr[&m_local.p.f], xmm0, 3);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// m_local.p.z = vertex[index[1]].t.u32[3]; // uint32 z is bypassed in t.w
|
||||
|
||||
mov(eax, ptr[ecx + offsetof(GSVertexSW, t.w)]);
|
||||
mov(ptr[&m_local.p.z], eax);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture_AVX2()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector8 dt(dscan.t);
|
||||
|
||||
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, t)]);
|
||||
|
||||
// GSVector8 dt8 = dt * shift[0];
|
||||
|
||||
vmulps(ymm1, ymm0, ymm3);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d8.stq = GSVector8::cast(GSVector8i(dt8));
|
||||
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
|
||||
vmovdqa(ptr[&m_local.d8.stq], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d8.stq = dt8;
|
||||
|
||||
vmovaps(ptr[&m_local.d8.stq], xmm1);
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector8 dstq = dt.xxxx/yyyy/zzzz();
|
||||
|
||||
vshufps(ymm1, ymm0, ymm0, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8 v = dstq * shift[1 + i];
|
||||
|
||||
if (i < 4)
|
||||
vmulps(ymm2, ymm1, Ymm(4 + i));
|
||||
else
|
||||
vmulps(ymm2, ymm1, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector8::cast(GSVector8i(v));
|
||||
|
||||
vcvttps2dq(ymm2, ymm2);
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovdqa(ptr[&m_local.d[i].s], ymm2); break;
|
||||
case 1: vmovdqa(ptr[&m_local.d[i].t], ymm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: vmovaps(ptr[&m_local.d[i].s], ymm2); break;
|
||||
case 1: vmovaps(ptr[&m_local.d[i].t], ymm2); break;
|
||||
case 2: vmovaps(ptr[&m_local.d[i].q], ymm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color_AVX2()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
// GSVector8 dc(dscan.c);
|
||||
|
||||
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// m_local.d8.c = GSVector8i(dc * shift[0]).xzyw().ps32();
|
||||
|
||||
vmulps(ymm1, ymm0, ymm3);
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpshufd(ymm1, ymm1, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
vpackssdw(ymm1, ymm1);
|
||||
vmovq(ptr[&m_local.d8.c], xmm1);
|
||||
|
||||
// ymm3 is not needed anymore
|
||||
|
||||
// GSVector8 dr = dc.xxxx();
|
||||
// GSVector8 db = dc.zzzz();
|
||||
|
||||
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32();
|
||||
|
||||
if (i < 4)
|
||||
vmulps(ymm0, ymm2, Ymm(4 + i));
|
||||
else
|
||||
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
vpackssdw(ymm0, ymm0);
|
||||
|
||||
// GSVector4i b = GSVector8i(db * shift[1 + i]).ps32();
|
||||
|
||||
if (i < 4)
|
||||
vmulps(ymm1, ymm3, Ymm(4 + i));
|
||||
else
|
||||
vmulps(ymm1, ymm3, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpackssdw(ymm1, ymm1);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
vmovdqa(ptr[&m_local.d[i].rb], ymm0);
|
||||
}
|
||||
|
||||
// GSVector8 dc(dscan.c);
|
||||
|
||||
vbroadcastf128(ymm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
|
||||
// GSVector8 dg = dc.yyyy();
|
||||
// GSVector8 da = dc.wwww();
|
||||
|
||||
vshufps(ymm2, ymm0, ymm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
vshufps(ymm3, ymm0, ymm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 8); i++)
|
||||
{
|
||||
// GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32();
|
||||
|
||||
if (i < 4)
|
||||
vmulps(ymm0, ymm2, Ymm(4 + i));
|
||||
else
|
||||
vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
vpackssdw(ymm0, ymm0);
|
||||
|
||||
// GSVector8i a = GSVector8i(da * shift[1 + i]).ps32();
|
||||
|
||||
if (i < 4)
|
||||
vmulps(ymm1, ymm3, Ymm(4 + i));
|
||||
else
|
||||
vmulps(ymm1, ymm3, ptr[g_const->m_shift_256b[i + 1]]);
|
||||
vcvttps2dq(ymm1, ymm1);
|
||||
vpackssdw(ymm1, ymm1);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
vmovdqa(ptr[&m_local.d[i].ga], ymm0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector8i c = GSVector8i(GSVector8(vertex[index[last]].c));
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
mov(ecx, ptr[esp + _index]);
|
||||
mov(ecx, ptr[ecx + sizeof(uint32) * last]);
|
||||
shl(ecx, 6); // * sizeof(GSVertexSW)
|
||||
add(ecx, ptr[esp + _vertex]);
|
||||
}
|
||||
|
||||
vbroadcasti128(ymm0, ptr[ecx + offsetof(GSVertexSW, c)]);
|
||||
vcvttps2dq(ymm0, ymm0);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
vpshufd(ymm1, ymm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vpunpcklwd(ymm0, ymm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
vpsrlw(ymm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
vpshufd(ymm1, ymm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
vpshufd(ymm2, ymm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
vmovdqa(ptr[&m_local.c.rb], ymm1);
|
||||
vmovdqa(ptr[&m_local.c.ga], ymm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,350 +0,0 @@
|
||||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2021 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "GSSetupPrimCodeGenerator.h"
|
||||
#include "GSVertexSW.h"
|
||||
#include "GS/GS_codegen.h"
|
||||
|
||||
#if _M_SSE < 0x501 && !(defined(_M_AMD64) || defined(_WIN64))
|
||||
|
||||
// Offsets (in bytes, relative to esp) at which the generated code reads its
// vertex, index and dscan arguments.
static constexpr int _args = 0;
static constexpr int _vertex = _args + 4;
static constexpr int _index = _args + 8;
static constexpr int _dscan = _args + 12;
|
||||
|
||||
void GSSetupPrimCodeGenerator::Generate_SSE()
|
||||
{
|
||||
if ((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
|
||||
{
|
||||
mov(edx, dword[esp + _dscan]);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 2 : 5); i++)
|
||||
{
|
||||
movaps(Xmm(3 + i), ptr[g_const->m_shift_128b[i]]);
|
||||
}
|
||||
}
|
||||
|
||||
Depth_SSE();
|
||||
|
||||
Texture_SSE();
|
||||
|
||||
Color_SSE();
|
||||
|
||||
ret();
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Depth_SSE()
|
||||
{
|
||||
if (!m_en.z && !m_en.f)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.prim != GS_SPRITE_CLASS)
|
||||
{
|
||||
// GSVector4 p = dscan.p;
|
||||
|
||||
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// GSVector4 df = p.wwww();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
// m_local.d4.f = GSVector4i(df * 4.0f).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[&m_local.d4.f], xmm2);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
movdqa(ptr[&m_local.d[i].f], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// GSVector4 dz = p.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
// m_local.d4.z = dz * 4.0f;
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
movdqa(ptr[&m_local.d4.z], xmm1);
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// m_local.d[i].z = dz * m_shift[i];
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, Xmm(4 + i));
|
||||
movdqa(ptr[&m_local.d[i].z], xmm1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4 p = vertex[index[1]].p;
|
||||
|
||||
mov(ecx, ptr[esp + _index]);
|
||||
mov(ecx, ptr[ecx + sizeof(uint32) * 1]);
|
||||
shl(ecx, 6); // * sizeof(GSVertexSW)
|
||||
add(ecx, ptr[esp + _vertex]);
|
||||
|
||||
movaps(xmm0, ptr[ecx + offsetof(GSVertexSW, p)]);
|
||||
|
||||
if (m_en.f)
|
||||
{
|
||||
// m_local.p.f = GSVector4i(p).zzzzh().zzzz();
|
||||
|
||||
cvttps2dq(xmm1, xmm0);
|
||||
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
movdqa(ptr[&m_local.p.f], xmm1);
|
||||
}
|
||||
|
||||
if (m_en.z)
|
||||
{
|
||||
// uint32 z is bypassed in t.w
|
||||
|
||||
movdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]);
|
||||
pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
movdqa(ptr[&m_local.p.z], xmm0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Texture_SSE()
|
||||
{
|
||||
if (!m_en.t)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// GSVector4 t = dscan.t;
|
||||
|
||||
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, t)]);
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
mulps(xmm1, xmm3);
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d4.stq = GSVector4i(t * 4.0f);
|
||||
|
||||
cvttps2dq(xmm1, xmm1);
|
||||
|
||||
movdqa(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d4.stq = t * 4.0f;
|
||||
|
||||
movaps(ptr[&m_local.d4.stq], xmm1);
|
||||
}
|
||||
|
||||
for (int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
|
||||
{
|
||||
// GSVector4 ds = t.xxxx();
|
||||
// GSVector4 dt = t.yyyy();
|
||||
// GSVector4 dq = t.zzzz();
|
||||
|
||||
movaps(xmm1, xmm0);
|
||||
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4 v = ds/dt * m_shift[i];
|
||||
|
||||
movaps(xmm2, xmm1);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
|
||||
if (m_sel.fst)
|
||||
{
|
||||
// m_local.d[i].s/t = GSVector4i(v);
|
||||
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: movdqa(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: movdqa(ptr[&m_local.d[i].t], xmm2); break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// m_local.d[i].s/t/q = v;
|
||||
|
||||
switch (j)
|
||||
{
|
||||
case 0: movaps(ptr[&m_local.d[i].s], xmm2); break;
|
||||
case 1: movaps(ptr[&m_local.d[i].t], xmm2); break;
|
||||
case 2: movaps(ptr[&m_local.d[i].q], xmm2); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void GSSetupPrimCodeGenerator::Color_SSE()
|
||||
{
|
||||
if (!m_en.c)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_sel.iip)
|
||||
{
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]);
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// m_local.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, xmm3);
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
packssdw(xmm2, xmm2);
|
||||
movdqa(ptr[&m_local.d4.c], xmm2);
|
||||
|
||||
// xmm3 is not needed anymore
|
||||
|
||||
// GSVector4 dr = c.xxxx();
|
||||
// GSVector4 db = c.zzzz();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].rb = r.upl16(b);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[&m_local.d[i].rb], xmm2);
|
||||
}
|
||||
|
||||
// GSVector4 c = dscan.c;
|
||||
|
||||
movaps(xmm0, ptr[edx + offsetof(GSVertexSW, c)]); // not enough regs, have to reload it
|
||||
movaps(xmm1, xmm0);
|
||||
|
||||
// GSVector4 dg = c.yyyy();
|
||||
// GSVector4 da = c.wwww();
|
||||
|
||||
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
||||
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
|
||||
for (int i = 0; i < (m_sel.notest ? 1 : 4); i++)
|
||||
{
|
||||
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm2, xmm0);
|
||||
mulps(xmm2, Xmm(4 + i));
|
||||
cvttps2dq(xmm2, xmm2);
|
||||
packssdw(xmm2, xmm2);
|
||||
|
||||
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
|
||||
|
||||
movaps(xmm3, xmm1);
|
||||
mulps(xmm3, Xmm(4 + i));
|
||||
cvttps2dq(xmm3, xmm3);
|
||||
packssdw(xmm3, xmm3);
|
||||
|
||||
// m_local.d[i].ga = g.upl16(a);
|
||||
|
||||
punpcklwd(xmm2, xmm3);
|
||||
movdqa(ptr[&m_local.d[i].ga], xmm2);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// GSVector4i c = GSVector4i(vertex[index[last].c);
|
||||
|
||||
int last = 0;
|
||||
|
||||
switch (m_sel.prim)
|
||||
{
|
||||
case GS_POINT_CLASS: last = 0; break;
|
||||
case GS_LINE_CLASS: last = 1; break;
|
||||
case GS_TRIANGLE_CLASS: last = 2; break;
|
||||
case GS_SPRITE_CLASS: last = 1; break;
|
||||
}
|
||||
|
||||
if (!(m_sel.prim == GS_SPRITE_CLASS && (m_en.z || m_en.f))) // if this is a sprite, the last vertex was already loaded in Depth()
|
||||
{
|
||||
mov(ecx, ptr[esp + _index]);
|
||||
mov(ecx, ptr[ecx + sizeof(uint32) * last]);
|
||||
shl(ecx, 6); // * sizeof(GSVertexSW)
|
||||
add(ecx, ptr[esp + _vertex]);
|
||||
}
|
||||
|
||||
cvttps2dq(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]);
|
||||
|
||||
// c = c.upl16(c.zwxy());
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
punpcklwd(xmm0, xmm1);
|
||||
|
||||
// if(!tme) c = c.srl16(7);
|
||||
|
||||
if (m_sel.tfx == TFX_NONE)
|
||||
{
|
||||
psrlw(xmm0, 7);
|
||||
}
|
||||
|
||||
// m_local.c.rb = c.xxxx();
|
||||
// m_local.c.ga = c.zzzz();
|
||||
|
||||
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
||||
|
||||
movdqa(ptr[&m_local.c.rb], xmm1);
|
||||
movdqa(ptr[&m_local.c.ga], xmm2);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -466,12 +466,8 @@
|
||||
<ClCompile Include="GS\GSDrawingContext.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanline.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.avx.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.avx2.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.avx.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.avx2.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSNewCodeGenerator.cpp" />
|
||||
<ClCompile Include="GS\GSDump.cpp" />
|
||||
<ClCompile Include="GS\Renderers\Common\GSFunctionMap.cpp" />
|
||||
<ClCompile Include="GS\Renderers\HW\GSHwHack.cpp" />
|
||||
@@ -490,12 +486,7 @@
|
||||
<ClCompile Include="GS\Window\GSSetting.cpp" />
|
||||
<ClCompile Include="GS\Window\GSSettingsDlg.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.avx.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.avx2.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.avx.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.avx2.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.cpp" />
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.cpp" />
|
||||
<ClCompile Include="GS\Renderers\OpenGL\GSShaderOGL.cpp" />
|
||||
<ClCompile Include="GS\GSState.cpp" />
|
||||
<ClCompile Include="GS\GSTables.cpp" />
|
||||
@@ -815,7 +806,6 @@
|
||||
<ClInclude Include="GS\Renderers\OpenGL\GLLoader.h" />
|
||||
<ClInclude Include="GS\Renderers\OpenGL\GLState.h" />
|
||||
<ClInclude Include="GS\GS.h" />
|
||||
<ClInclude Include="GS\GS_codegen.h" />
|
||||
<ClInclude Include="GS\GS_types.h" />
|
||||
<ClInclude Include="GS\GSAlignedClass.h" />
|
||||
<ClInclude Include="GS\GSBlock.h" />
|
||||
@@ -834,6 +824,8 @@
|
||||
<ClInclude Include="GS\GSDrawingEnvironment.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanline.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSNewCodeGenerator.h" />
|
||||
<ClInclude Include="GS\GSDump.h" />
|
||||
<ClInclude Include="GS\Renderers\Common\GSFastList.h" />
|
||||
<ClInclude Include="GS\Renderers\Common\GSFunctionMap.h" />
|
||||
@@ -853,6 +845,7 @@
|
||||
<ClInclude Include="GS\Window\GSSetting.h" />
|
||||
<ClInclude Include="GS\Window\GSSettingsDlg.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.h" />
|
||||
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.h" />
|
||||
<ClInclude Include="GS\Renderers\OpenGL\GSShaderOGL.h" />
|
||||
<ClInclude Include="GS\GSState.h" />
|
||||
<ClInclude Include="GS\GSTables.h" />
|
||||
|
||||
@@ -1517,22 +1517,10 @@
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.avx.cpp">
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.avx2.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x64.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.avx.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.avx2.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.x86.cpp">
|
||||
<ClCompile Include="GS\Renderers\SW\GSNewCodeGenerator.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSRendererSW.cpp">
|
||||
@@ -1541,24 +1529,6 @@
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.avx.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.avx2.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x64.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.avx.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.avx2.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.x86.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="GS\Renderers\SW\GSTextureCacheSW.cpp">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClCompile>
|
||||
@@ -2508,9 +2478,6 @@
|
||||
<ClInclude Include="GS\GS.h">
|
||||
<Filter>System\Ps2\GS</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\GS_codegen.h">
|
||||
<Filter>System\Ps2\GS</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\GS_types.h">
|
||||
<Filter>System\Ps2\GS</Filter>
|
||||
</ClInclude>
|
||||
@@ -2631,6 +2598,12 @@
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSDrawScanlineCodeGenerator.all.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSNewCodeGenerator.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSRendererSW.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
@@ -2640,6 +2613,9 @@
|
||||
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.all.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="GS\Renderers\SW\GSTextureCacheSW.h">
|
||||
<Filter>System\Ps2\GS\Renderers\Software</Filter>
|
||||
</ClInclude>
|
||||
|
||||
Reference in New Issue
Block a user