// SPDX-FileCopyrightText: 2002-2025 PCSX2 Dev Team
// SPDX-License-Identifier: GPL-3.0+
#include "GS/GSClut.h"
#include "GS/GSExtra.h"
#include "GS/GSLocalMemory.h"
#include "GS/GSGL.h"
#include "GS/GSUtil.h"
#include "GS/Renderers/Common/GSDevice.h"
#include "GS/Renderers/Common/GSRenderer.h"
#include "common/AlignedMalloc.h"
GSClut::GSClut(GSLocalMemory* mem)
: m_mem(mem)
{
// 1k + 1k for mirrored area simulating wrapping memory
m_clut = static_cast<u16*>(_aligned_malloc(CLUT_ALLOC_SIZE, VECTOR_ALIGNMENT));
if (!m_clut)
pxFailRel("Failed to allocate CLUT storage.");
m_buff32 = reinterpret_cast<u32*>(reinterpret_cast<u8*>(m_clut) + 2048); // 1k
m_buff64 = reinterpret_cast<u64*>(reinterpret_cast<u8*>(m_clut) + 4096); // 2k
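// Buffer roles (see the readers/writers below): m_clut caches the current CLUT as two 256-entry
// u16 planes (the low and high 16 bits of each color), m_buff32 receives the palette expanded to
// 32-bit colors by Read32(), and m_buff64 the 64-bit color pairs built by ExpandCLUT64_* for
// 4-bit texture lookups.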
m_write.dirty = 1;
m_read.dirty = true;
for (int i = 0; i < 16; i++)
{
for (int j = 0; j < 64; j++)
{
// The GS seems to check the lower 3 bits to tell if the format is 8/4bit
// for the reload.
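// (PSMT8 = 0x13 and PSMT8H = 0x1B both have low bits 011; PSMT4 = 0x14, PSMT4HL = 0x24 and
// PSMT4HH = 0x2C all have low bits 100.)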
const bool eight_bit = (j & 0x7) == 0x3;
const bool four_bit = (j & 0x7) == 0x4;
switch (i)
{
case PSMCT32:
case PSMCT24: // undocumented (KH?)
if (eight_bit)
m_wc[0][i][j] = &GSClut::WriteCLUT32_I8_CSM1;
else if (four_bit)
m_wc[0][i][j] = &GSClut::WriteCLUT32_I4_CSM1;
else
m_wc[0][i][j] = &GSClut::WriteCLUT_NULL;
break;
case PSMCT16:
if (eight_bit)
m_wc[0][i][j] = &GSClut::WriteCLUT16_I8_CSM1;
else if (four_bit)
m_wc[0][i][j] = &GSClut::WriteCLUT16_I4_CSM1;
else
m_wc[0][i][j] = &GSClut::WriteCLUT_NULL;
break;
case PSMCT16S:
if (eight_bit)
m_wc[0][i][j] = &GSClut::WriteCLUT16S_I8_CSM1;
else if (four_bit)
m_wc[0][i][j] = &GSClut::WriteCLUT16S_I4_CSM1;
else
m_wc[0][i][j] = &GSClut::WriteCLUT_NULL;
break;
default:
m_wc[0][i][j] = &GSClut::WriteCLUT_NULL;
}
// TODO: test this
m_wc[1][i][j] = &GSClut::WriteCLUT_NULL;
}
}
m_wc[1][PSMCT32][PSMT8] = &GSClut::WriteCLUT32_CSM2<256>;
m_wc[1][PSMCT32][PSMT8H] = &GSClut::WriteCLUT32_CSM2<256>;
m_wc[1][PSMCT32][PSMT4] = &GSClut::WriteCLUT32_CSM2<16>;
m_wc[1][PSMCT32][PSMT4HL] = &GSClut::WriteCLUT32_CSM2<16>;
m_wc[1][PSMCT32][PSMT4HH] = &GSClut::WriteCLUT32_CSM2<16>;
m_wc[1][PSMCT24][PSMT8] = &GSClut::WriteCLUT32_CSM2<256>;
m_wc[1][PSMCT24][PSMT8H] = &GSClut::WriteCLUT32_CSM2<256>;
m_wc[1][PSMCT24][PSMT4] = &GSClut::WriteCLUT32_CSM2<16>;
m_wc[1][PSMCT24][PSMT4HL] = &GSClut::WriteCLUT32_CSM2<16>;
m_wc[1][PSMCT24][PSMT4HH] = &GSClut::WriteCLUT32_CSM2<16>;
m_wc[1][PSMCT16][PSMT8] = &GSClut::WriteCLUT16_CSM2<256>;
m_wc[1][PSMCT16][PSMT8H] = &GSClut::WriteCLUT16_CSM2<256>;
m_wc[1][PSMCT16][PSMT4] = &GSClut::WriteCLUT16_CSM2<16>;
m_wc[1][PSMCT16][PSMT4HL] = &GSClut::WriteCLUT16_CSM2<16>;
m_wc[1][PSMCT16][PSMT4HH] = &GSClut::WriteCLUT16_CSM2<16>;
m_wc[1][PSMCT16S][PSMT8] = &GSClut::WriteCLUT16S_CSM2<256>;
m_wc[1][PSMCT16S][PSMT8H] = &GSClut::WriteCLUT16S_CSM2<256>;
m_wc[1][PSMCT16S][PSMT4] = &GSClut::WriteCLUT16S_CSM2<16>;
m_wc[1][PSMCT16S][PSMT4HL] = &GSClut::WriteCLUT16S_CSM2<16>;
m_wc[1][PSMCT16S][PSMT4HH] = &GSClut::WriteCLUT16S_CSM2<16>;
}
GSClut::~GSClut()
{
delete m_gpu_clut4;
delete m_gpu_clut8;
_aligned_free(m_clut);
}
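// m_write.dirty is a 2-bit flag: bit 0 means the CLUT registers/contents changed since the last
// load, bit 1 means the change came from a draw (see InvalidateRange). ClearDrawInvalidity()
// downgrades a draw invalidation back to a plain dirty flag.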
u8 GSClut::IsInvalid()
{
return m_write.dirty;
}
void GSClut::ClearDrawInvalidity()
{
if (m_write.dirty & 2)
{
m_write.dirty = 1;
}
}
u32 GSClut::GetCLUTCBP()
{
return m_write.TEX0.CBP;
}
u32 GSClut::GetCLUTCPSM()
{
return m_write.TEX0.CPSM;
}
void GSClut::SetNextCLUTTEX0(u64 TEX0)
{
m_write.next_tex0 = TEX0;
}
void GSClut::Reset()
{
std::memset(m_CBP, 0, sizeof(m_CBP));
std::memset(m_clut, 0, CLUT_ALLOC_SIZE);
m_write = {};
m_write.dirty = 1;
m_read = {};
m_read.dirty = true;
}
bool GSClut::InvalidateRange(u32 start_block, u32 end_block, bool is_draw)
{
if (m_write.dirty & 2)
return m_write.dirty;
GIFRegTEX0 next_cbp;
next_cbp.U64 = m_write.next_tex0;
// Handle wrapping writes. Star Wars Battlefront 2 does this.
if ((end_block & 0xFFE0) < (start_block & 0xFFE0))
{
if ((next_cbp.CBP + 3U) <= end_block)
next_cbp.CBP += 0x4000;
end_block += 0x4000;
}
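// A full 256-entry 32-bit CLUT covers four 256-byte blocks, so treat the CLUT as spanning
// CBP..CBP+3 when testing for overlap (worst case; smaller CLUT formats cover fewer blocks).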
if ((next_cbp.CBP + 3U) >= start_block && end_block >= next_cbp.CBP)
{
m_write.dirty |= is_draw ? 2 : 1;
}
return m_write.dirty;
}
bool GSClut::CanLoadCLUT(const GIFRegTEX0& TEX0, const bool update_CBP)
{
if ((TEX0.PSM & 0x7) < 3)
return false;
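// CLD controls whether/when the CLUT buffer is reloaded:
//   0 = no load, 1 = always load,
//   2/3 = load and record CBP in CBP0/CBP1,
//   4/5 = load only if CBP differs from CBP0/CBP1 (and update it),
//   6/7 = reserved (treated as no load here, see below).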
switch (TEX0.CLD)
{
case 0:
case 6: // FFX2 menu.
case 7: // Ford Mustang Racing, Bouken Jidai Katsugeki Goemon.
return false;
case 1:
break;
case 2:
if (update_CBP)
m_CBP[0] = TEX0.CBP;
break;
case 3:
if (update_CBP)
m_CBP[1] = TEX0.CBP;
break;
case 4:
if (m_CBP[0] == TEX0.CBP)
return false;
if (update_CBP)
m_CBP[0] = TEX0.CBP;
break;
case 5:
if (m_CBP[1] == TEX0.CBP)
return false;
if (update_CBP)
m_CBP[1] = TEX0.CBP;
break;
default:
ASSUME(0);
}
return true;
}
bool GSClut::WriteTest(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
// Check that PSM is an indexed format BEFORE the load condition; updating CBP0/1 on an invalid format is not allowed
// and can break games. Corvette (NTSC) is a good example of this.
if ((TEX0.PSM & 0x7) < 3)
return false;
if (!CanLoadCLUT(TEX0, true))
return false;
// CLUT only reloads if PSM is a valid index type, avoid unnecessary flushes.
return m_write.IsDirty(TEX0, TEXCLUT);
}
void GSClut::Write(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
m_write.TEX0 = TEX0;
m_write.TEXCLUT = TEXCLUT;
m_read.dirty = true;
m_write.dirty = 0;
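// Dispatch through the write table built in the constructor: CSM selects the CSM1/CSM2 handlers,
// CPSM the CLUT storage format, and PSM the index width.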
(this->*m_wc[TEX0.CSM][TEX0.CPSM][TEX0.PSM])(TEX0, TEXCLUT);
}
void GSClut::WriteCLUT32_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
ALIGN_STACK(32);
WriteCLUT_T32_I8_CSM1((u32*)m_mem->BlockPtr32(0, 0, TEX0.CBP, 1), m_clut, (TEX0.CSA & 15));
}
void GSClut::WriteCLUT32_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
ALIGN_STACK(32);
WriteCLUT_T32_I4_CSM1((u32*)m_mem->BlockPtr32(0, 0, TEX0.CBP, 1), m_clut + ((TEX0.CSA & 15) << 4));
}
void GSClut::WriteCLUT16_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
WriteCLUT_T16_I8_CSM1((u16*)m_mem->BlockPtr16(0, 0, TEX0.CBP, 1), m_clut + (TEX0.CSA << 4));
}
void GSClut::WriteCLUT16_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
WriteCLUT_T16_I4_CSM1((u16*)m_mem->BlockPtr16(0, 0, TEX0.CBP, 1), m_clut + (TEX0.CSA << 4));
}
void GSClut::WriteCLUT16S_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
WriteCLUT_T16_I8_CSM1((u16*)m_mem->BlockPtr16S(0, 0, TEX0.CBP, 1), m_clut + (TEX0.CSA << 4));
}
void GSClut::WriteCLUT16S_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
WriteCLUT_T16_I4_CSM1((u16*)m_mem->BlockPtr16S(0, 0, TEX0.CBP, 1), m_clut + (TEX0.CSA << 4));
}
template <int n>
void GSClut::WriteCLUT32_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
GSOffset off = GSOffset::fromKnownPSM(TEX0.CBP, TEXCLUT.CBW, PSMCT32);
GSOffset::PAHelper pa = off.paMulti(TEXCLUT.COU << 4, TEXCLUT.COV);
u32* vm = m_mem->vm32();
u16* RESTRICT clut = m_clut + ((TEX0.CSA & 15) << 4);
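// CSM2: CLUT entries sit linearly in local memory starting at (COU * 16, COV) with buffer width
// CBW. Split each 32-bit color into its low/high 16-bit halves, 256 entries apart, to match the
// layout used by the CSM1 writers and the readers below.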
for (int i = 0; i < n; i++)
{
u32 c = vm[pa.value(i)];
clut[i] = (u16)(c & 0xffff);
clut[i + 256] = (u16)(c >> 16);
}
}
template <int n>
void GSClut::WriteCLUT16_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
const GSOffset off = GSOffset::fromKnownPSM(TEX0.CBP, TEXCLUT.CBW, PSMCT16);
const GSOffset::PAHelper pa = off.paMulti(TEXCLUT.COU << 4, TEXCLUT.COV);
u16* vm = m_mem->vm16();
u16* RESTRICT clut = m_clut + (TEX0.CSA << 4);
for (int i = 0; i < n; i++)
{
clut[i] = vm[pa.value(i)];
}
}
template <int n>
void GSClut::WriteCLUT16S_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
const GSOffset off = GSOffset::fromKnownPSM(TEX0.CBP, TEXCLUT.CBW, PSMCT16S);
const GSOffset::PAHelper pa = off.paMulti(TEXCLUT.COU << 4, TEXCLUT.COV);
u16* vm = m_mem->vm16();
u16* RESTRICT clut = m_clut + (TEX0.CSA << 4);
for (int i = 0; i < n; i++)
{
clut[i] = vm[pa.value(i)];
}
}
void GSClut::WriteCLUT_NULL(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
// xenosaga3, bios
GL_INS("[WARNING] CLUT write ignored (psm: %d, cpsm: %d)", TEX0.PSM, TEX0.CPSM);
}
#if 0
void GSClut::Read(const GIFRegTEX0& TEX0)
{
if(m_read.IsDirty(TEX0))
{
m_read.TEX0 = TEX0;
m_read.dirty = false;
u16* clut = m_clut;
if(TEX0.CPSM == PSMCT32 || TEX0.CPSM == PSMCT24)
{
switch(TEX0.PSM)
{
case PSMT8:
case PSMT8H:
clut += (TEX0.CSA & 15) << 4;
ReadCLUT_T32_I8(clut, m_buff32);
break;
case PSMT4:
case PSMT4HL:
case PSMT4HH:
clut += (TEX0.CSA & 15) << 4;
ReadCLUT_T32_I4(clut, m_buff32, m_buff64);
break;
}
}
else if (TEX0.CPSM == PSMCT16 || TEX0.CPSM == PSMCT16S)
{
switch(TEX0.PSM)
{
case PSMT8:
case PSMT8H:
clut += TEX0.CSA << 4;
ReadCLUT_T16_I8(clut, m_buff32);
break;
case PSMT4:
case PSMT4HL:
case PSMT4HH:
clut += TEX0.CSA << 4;
ReadCLUT_T16_I4(clut, m_buff32, m_buff64);
break;
}
}
}
}
#endif
void GSClut::Read32(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA)
{
if (m_read.IsDirty(TEX0, TEXA))
{
m_read.TEX0 = TEX0;
m_read.TEXA = TEXA;
m_read.dirty = false;
m_read.adirty = true;
u16* clut = m_clut;
if (TEX0.CPSM == PSMCT32 || TEX0.CPSM == PSMCT24)
{
switch (TEX0.PSM)
{
case PSMT8:
case PSMT8H:
ReadCLUT_T32_I8(clut, m_buff32, (TEX0.CSA & 15) << 4);
break;
case PSMT4:
case PSMT4HL:
case PSMT4HH:
clut += (TEX0.CSA & 15) << 4;
// TODO: merge these functions
ReadCLUT_T32_I4(clut, m_buff32);
ExpandCLUT64_T32_I8(m_buff32, (u64*)m_buff64); // sw renderer does not need m_buff64 anymore
break;
}
}
else if (TEX0.CPSM == PSMCT16 || TEX0.CPSM == PSMCT16S)
{
switch (TEX0.PSM)
{
case PSMT8:
case PSMT8H:
clut += TEX0.CSA << 4;
Expand16(clut, m_buff32, 256, TEXA);
break;
case PSMT4:
case PSMT4HL:
case PSMT4HH:
clut += TEX0.CSA << 4;
// TODO: merge these functions
Expand16(clut, m_buff32, 16, TEXA);
ExpandCLUT64_T32_I8(m_buff32, (u64*)m_buff64); // sw renderer does not need m_buff64 anymore
break;
}
}
m_current_gpu_clut = nullptr;
if (GSConfig.UserHacks_GPUTargetCLUTMode != GSGPUTargetCLUTMode::Disabled)
{
const bool is_4bit = (TEX0.PSM == PSMT4 || TEX0.PSM == PSMT4HL || TEX0.PSM == PSMT4HH);
u32 CBW;
GSVector2i offset;
GSVector2i size;
float scale;
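// In CSM1 the CLUT data is swizzled at CBP like a normal buffer, so the lookup treats it as a
// 16x16 (8-bit) or 8x2 (4-bit) region; in CSM2 the entries sit linearly at (COU * 16, COV) with
// buffer width CBW (see the CSM2 writers above).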
if (!TEX0.CSM)
{
CBW = 0; // don't care
offset = {};
size.x = is_4bit ? 8 : 16;
size.y = is_4bit ? 2 : 16;
}
else
{
CBW = m_write.TEXCLUT.CBW;
offset.x = m_write.TEXCLUT.COU;
offset.y = m_write.TEXCLUT.COV;
size.x = is_4bit ? 16 : 256;
size.y = 1;
}
GSTexture* src = g_gs_renderer->LookupPaletteSource(TEX0.CBP, TEX0.CPSM, CBW, offset, &scale, size);
if (src)
{
GSTexture* dst = is_4bit ? m_gpu_clut4 : m_gpu_clut8;
const u32 dst_size = is_4bit ? 16 : 256;
const u32 dOffset = (TEX0.CSA & ((TEX0.CPSM == PSMCT16 || TEX0.CPSM == PSMCT16S) ? 15u : 31u)) << 4;
if (!dst)
{
// allocate texture lazily
dst = g_gs_device->CreateRenderTarget(dst_size, 1, GSTexture::Format::Color, false);
is_4bit ? (m_gpu_clut4 = dst) : (m_gpu_clut8 = dst);
}
if (dst)
{
GL_PUSH("Update GPU CLUT [CBP=%04X, CPSM=%s, CBW=%u, CSA=%u, Offset=(%d,%d)]",
TEX0.CBP, psm_str(TEX0.CPSM), CBW, TEX0.CSA, offset.x, offset.y);
g_gs_device->UpdateCLUTTexture(src, scale, offset.x, offset.y, dst, dOffset, dst_size);
m_current_gpu_clut = dst;
}
}
}
}
}
void GSClut::GetAlphaMinMax32(int& amin_out, int& amax_out)
{
// call only after Read32
if (m_read.dirty)
GL_INS("GSClut: GetAlphaMinMax32 m_read.dirty");
if (m_read.adirty)
{
m_read.adirty = false;
if (GSLocalMemory::m_psm[m_read.TEX0.CPSM].trbpp == 24 && m_read.TEXA.AEM == 0)
{
m_read.amin = m_read.TEXA.TA0;
m_read.amax = m_read.TEXA.TA0;
}
else
{
const GSVector4i* p = (const GSVector4i*)m_buff32;
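// SIMD reduction: shift each 32-bit color right by 24 to isolate alpha, pack down to bytes,
// and keep a running byte-wise min/max; the shuffles below then fold the vectors down to a
// single (amin, amax) pair.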
GSVector4i amin, amax;
if (GSLocalMemory::m_psm[m_read.TEX0.PSM].pal == 256)
{
amin = GSVector4i::xffffffff();
amax = GSVector4i::zero();
for (int i = 0; i < 16; i++)
{
GSVector4i v0 = (p[i * 4 + 0] >> 24).ps32(p[i * 4 + 1] >> 24);
GSVector4i v1 = (p[i * 4 + 2] >> 24).ps32(p[i * 4 + 3] >> 24);
GSVector4i v2 = v0.pu16(v1);
amin = amin.min_u8(v2);
amax = amax.max_u8(v2);
}
}
else
{
pxAssert(GSLocalMemory::m_psm[m_read.TEX0.PSM].pal == 16);
const GSVector4i v0 = (p[0] >> 24).ps32(p[1] >> 24);
const GSVector4i v1 = (p[2] >> 24).ps32(p[3] >> 24);
const GSVector4i v2 = v0.pu16(v1);
amin = v2;
amax = v2;
}
amin = amin.min_u8(amin.zwxy());
amax = amax.max_u8(amax.zwxy());
amin = amin.min_u8(amin.zwxyl());
amax = amax.max_u8(amax.zwxyl());
amin = amin.min_u8(amin.yxwzl());
amax = amax.max_u8(amax.yxwzl());
const GSVector4i v0 = amin.upl8(amax).u8to16();
const GSVector4i v1 = v0.yxwz();
m_read.amin = v0.min_i16(v1).extract16<0>();
m_read.amax = v0.max_i16(v1).extract16<1>();
}
}
amin_out = m_read.amin;
amax_out = m_read.amax;
}
//
void GSClut::WriteCLUT_T32_I8_CSM1(const u32* RESTRICT src, u16* RESTRICT clut, u16 offset)
{
// The offset is required when CSA is offset from the base of the CLUT, so we start at the right entries
for (int i = offset; i < 16; i ++)
{
const int off = i << 4; // WriteCLUT_T32_I4_CSM1 loads 16 at a time
// Source column
const int s = clutTableT32I8[off & 0x70] | (off & 0x80);
WriteCLUT_T32_I4_CSM1(&src[s], &clut[off]);
}
}
__forceinline void GSClut::WriteCLUT_T32_I4_CSM1(const u32* RESTRICT src, u16* RESTRICT clut)
{
// 1 block
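// A 32-bit CLUT block holds 16 colors. Split each color into its low and high 16 bits and store
// the halves 256 u16 apart, which is where the read side (ReadCLUT_T32_I4) expects them.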
#if _M_SSE >= 0x501
GSVector8i* s = (GSVector8i*)src;
GSVector8i* d = (GSVector8i*)clut;
GSVector8i v0 = s[0].acbd();
GSVector8i v1 = s[1].acbd();
GSVector8i::sw16(v0, v1);
GSVector8i::sw32(v0, v1);
GSVector8i::sw16(v0, v1);
d[0] = v0;
d[16] = v1;
#else
GSVector4i* s = (GSVector4i*)src;
GSVector4i* d = (GSVector4i*)clut;
GSVector4i v0 = s[0];
GSVector4i v1 = s[1];
GSVector4i v2 = s[2];
GSVector4i v3 = s[3];
GSVector4i::sw16(v0, v1, v2, v3);
GSVector4i::sw32(v0, v1, v2, v3);
GSVector4i::sw16(v0, v2, v1, v3);
d[0] = v0;
d[1] = v2;
d[32] = v1;
d[33] = v3;
#endif
}
void GSClut::WriteCLUT_T16_I8_CSM1(const u16* RESTRICT src, u16* RESTRICT clut)
{
// 2 blocks
GSVector4i* s = (GSVector4i*)src;
GSVector4i* d = (GSVector4i*)clut;
for (int i = 0; i < 32; i += 4)
{
GSVector4i v0 = s[i + 0];
GSVector4i v1 = s[i + 1];
GSVector4i v2 = s[i + 2];
GSVector4i v3 = s[i + 3];
GSVector4i::sw16(v0, v1, v2, v3);
GSVector4i::sw32(v0, v1, v2, v3);
GSVector4i::sw16(v0, v2, v1, v3);
d[i + 0] = v0;
d[i + 1] = v2;
d[i + 2] = v1;
d[i + 3] = v3;
}
}
__forceinline void GSClut::WriteCLUT_T16_I4_CSM1(const u16* RESTRICT src, u16* RESTRICT clut)
{
// 1 block (half)
for (int i = 0; i < 16; i++)
{
clut[i] = src[clutTableT16I4[i]];
}
}
void GSClut::ReadCLUT_T32_I8(const u16* RESTRICT clut, u32* RESTRICT dst, int offset)
{
// Okay, this deserves a small explanation.
// T32 I8 can address up to 256 colors, but the offset can be non-zero when reading.
// Previously I assumed the address would wrap around the end of the buffer to the beginning,
// but it turns out that's incorrect: the address doesn't mirror, it clamps to the last offset,
// probably through some addressing mechanism which then picks the color from the lower 0xF of the requested CLUT entry.
// If we don't do this, the dirt in GTA SA goes transparent and driving through dirt actually cleans the car.
for (int i = 0; i < 256; i += 16)
{
// Minimum of (i + offset) and the last CSA * 16 (240)
ReadCLUT_T32_I4(&clut[std::min((i + offset), 240)], &dst[i]);
}
}
__forceinline void GSClut::ReadCLUT_T32_I4(const u16* RESTRICT clut, u32* RESTRICT dst)
{
GSVector4i* s = (GSVector4i*)clut;
GSVector4i* d = (GSVector4i*)dst;
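// s[0]/s[1] hold the 16 low halves and s[32]/s[33] the 16 high halves (256 u16 further on,
// matching WriteCLUT_T32_I4_CSM1); interleave them back into 32-bit colors.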
GSVector4i v0 = s[0];
GSVector4i v1 = s[1];
GSVector4i v2 = s[32];
GSVector4i v3 = s[33];
GSVector4i::sw16(v0, v2, v1, v3);
d[0] = v0;
d[1] = v1;
d[2] = v2;
d[3] = v3;
}
#if 0
__forceinline void GSClut::ReadCLUT_T32_I4(const u16* RESTRICT clut, u32* RESTRICT dst32, u64* RESTRICT dst64)
{
GSVector4i* s = (GSVector4i*)clut;
GSVector4i* d32 = (GSVector4i*)dst32;
GSVector4i* d64 = (GSVector4i*)dst64;
GSVector4i s0 = s[0];
GSVector4i s1 = s[1];
GSVector4i s2 = s[32];
GSVector4i s3 = s[33];
GSVector4i::sw16(s0, s2, s1, s3);
d32[0] = s0;
d32[1] = s1;
d32[2] = s2;
d32[3] = s3;
ExpandCLUT64_T32(s0, s0, s1, s2, s3, &d64[0]);
ExpandCLUT64_T32(s1, s0, s1, s2, s3, &d64[32]);
ExpandCLUT64_T32(s2, s0, s1, s2, s3, &d64[64]);
ExpandCLUT64_T32(s3, s0, s1, s2, s3, &d64[96]);
}
#endif
#if 0
void GSClut::ReadCLUT_T16_I8(const u16* RESTRICT clut, u32* RESTRICT dst)
{
for(int i = 0; i < 256; i += 16)
{
ReadCLUT_T16_I4(&clut[i], &dst[i]);
}
}
#endif
#if 0
__forceinline void GSClut::ReadCLUT_T16_I4(const u16* RESTRICT clut, u32* RESTRICT dst)
{
GSVector4i* s = (GSVector4i*)clut;
GSVector4i* d = (GSVector4i*)dst;
GSVector4i v0 = s[0];
GSVector4i v1 = s[1];
d[0] = v0.upl16();
d[1] = v0.uph16();
d[2] = v1.upl16();
d[3] = v1.uph16();
}
#endif
#if 0
__forceinline void GSClut::ReadCLUT_T16_I4(const u16* RESTRICT clut, u32* RESTRICT dst32, u64* RESTRICT dst64)
{
GSVector4i* s = (GSVector4i*)clut;
GSVector4i* d32 = (GSVector4i*)dst32;
GSVector4i* d64 = (GSVector4i*)dst64;
GSVector4i v0 = s[0];
GSVector4i v1 = s[1];
GSVector4i s0 = v0.upl16();
GSVector4i s1 = v0.uph16();
GSVector4i s2 = v1.upl16();
GSVector4i s3 = v1.uph16();
d32[0] = s0;
d32[1] = s1;
d32[2] = s2;
d32[3] = s3;
ExpandCLUT64_T16(s0, s0, s1, s2, s3, &d64[0]);
ExpandCLUT64_T16(s1, s0, s1, s2, s3, &d64[32]);
ExpandCLUT64_T16(s2, s0, s1, s2, s3, &d64[64]);
ExpandCLUT64_T16(s3, s0, s1, s2, s3, &d64[96]);
}
#endif
void GSClut::ExpandCLUT64_T32_I8(const u32* RESTRICT src, u64* RESTRICT dst)
{
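// Build a 256-entry table of 64-bit color pairs: entry (hi * 16 + lo) packs clut32[lo] in the low
// dword and clut32[hi] in the high dword, so the SW renderer can expand two 4-bit texels per lookup.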
GSVector4i* s = (GSVector4i*)src;
GSVector4i* d = (GSVector4i*)dst;
const GSVector4i s0 = s[0];
const GSVector4i s1 = s[1];
const GSVector4i s2 = s[2];
const GSVector4i s3 = s[3];
ExpandCLUT64_T32(s0, s0, s1, s2, s3, &d[0]);
ExpandCLUT64_T32(s1, s0, s1, s2, s3, &d[32]);
ExpandCLUT64_T32(s2, s0, s1, s2, s3, &d[64]);
ExpandCLUT64_T32(s3, s0, s1, s2, s3, &d[96]);
}
__forceinline void GSClut::ExpandCLUT64_T32(const GSVector4i& hi, const GSVector4i& lo0, const GSVector4i& lo1, const GSVector4i& lo2, const GSVector4i& lo3, GSVector4i* dst)
{
ExpandCLUT64_T32(hi.xxxx(), lo0, &dst[0]);
ExpandCLUT64_T32(hi.xxxx(), lo1, &dst[2]);
ExpandCLUT64_T32(hi.xxxx(), lo2, &dst[4]);
ExpandCLUT64_T32(hi.xxxx(), lo3, &dst[6]);
ExpandCLUT64_T32(hi.yyyy(), lo0, &dst[8]);
ExpandCLUT64_T32(hi.yyyy(), lo1, &dst[10]);
ExpandCLUT64_T32(hi.yyyy(), lo2, &dst[12]);
ExpandCLUT64_T32(hi.yyyy(), lo3, &dst[14]);
ExpandCLUT64_T32(hi.zzzz(), lo0, &dst[16]);
ExpandCLUT64_T32(hi.zzzz(), lo1, &dst[18]);
ExpandCLUT64_T32(hi.zzzz(), lo2, &dst[20]);
ExpandCLUT64_T32(hi.zzzz(), lo3, &dst[22]);
ExpandCLUT64_T32(hi.wwww(), lo0, &dst[24]);
ExpandCLUT64_T32(hi.wwww(), lo1, &dst[26]);
ExpandCLUT64_T32(hi.wwww(), lo2, &dst[28]);
ExpandCLUT64_T32(hi.wwww(), lo3, &dst[30]);
}
__forceinline void GSClut::ExpandCLUT64_T32(const GSVector4i& hi, const GSVector4i& lo, GSVector4i* dst)
{
dst[0] = lo.upl32(hi);
dst[1] = lo.uph32(hi);
}
#if 0
void GSClut::ExpandCLUT64_T16_I8(const u32* RESTRICT src, u64* RESTRICT dst)
{
GSVector4i* s = (GSVector4i*)src;
GSVector4i* d = (GSVector4i*)dst;
GSVector4i s0 = s[0];
GSVector4i s1 = s[1];
GSVector4i s2 = s[2];
GSVector4i s3 = s[3];
ExpandCLUT64_T16(s0, s0, s1, s2, s3, &d[0]);
ExpandCLUT64_T16(s1, s0, s1, s2, s3, &d[32]);
ExpandCLUT64_T16(s2, s0, s1, s2, s3, &d[64]);
ExpandCLUT64_T16(s3, s0, s1, s2, s3, &d[96]);
}
#endif
__forceinline void GSClut::ExpandCLUT64_T16(const GSVector4i& hi, const GSVector4i& lo0, const GSVector4i& lo1, const GSVector4i& lo2, const GSVector4i& lo3, GSVector4i* dst)
{
ExpandCLUT64_T16(hi.xxxx(), lo0, &dst[0]);
ExpandCLUT64_T16(hi.xxxx(), lo1, &dst[2]);
ExpandCLUT64_T16(hi.xxxx(), lo2, &dst[4]);
ExpandCLUT64_T16(hi.xxxx(), lo3, &dst[6]);
ExpandCLUT64_T16(hi.yyyy(), lo0, &dst[8]);
ExpandCLUT64_T16(hi.yyyy(), lo1, &dst[10]);
ExpandCLUT64_T16(hi.yyyy(), lo2, &dst[12]);
ExpandCLUT64_T16(hi.yyyy(), lo3, &dst[14]);
ExpandCLUT64_T16(hi.zzzz(), lo0, &dst[16]);
ExpandCLUT64_T16(hi.zzzz(), lo1, &dst[18]);
ExpandCLUT64_T16(hi.zzzz(), lo2, &dst[20]);
ExpandCLUT64_T16(hi.zzzz(), lo3, &dst[22]);
ExpandCLUT64_T16(hi.wwww(), lo0, &dst[24]);
ExpandCLUT64_T16(hi.wwww(), lo1, &dst[26]);
ExpandCLUT64_T16(hi.wwww(), lo2, &dst[28]);
ExpandCLUT64_T16(hi.wwww(), lo3, &dst[30]);
}
__forceinline void GSClut::ExpandCLUT64_T16(const GSVector4i& hi, const GSVector4i& lo, GSVector4i* dst)
{
dst[0] = lo.upl16(hi);
dst[1] = lo.uph16(hi);
}
// TODO
constinit const GSVector4i GSClut::m_bm = GSVector4i::cxpr(0x00007c00);
constinit const GSVector4i GSClut::m_gm = GSVector4i::cxpr(0x000003e0);
constinit const GSVector4i GSClut::m_rm = GSVector4i::cxpr(0x0000001f);
void GSClut::Expand16(const u16* RESTRICT src, u32* RESTRICT dst, int w, const GIFRegTEXA& TEXA)
{
pxAssert((w & 7) == 0);
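// Each 16-bit entry is A1 B5 G5 R5. The masks above isolate R (0x001f), G (0x03e0) and B (0x7c00);
// shifting by 3/6/9 places each 5-bit channel at the top of its 8-bit slot. Alpha comes from TA0
// (A = 0) or TA1 (A = 1), selected by blending on the sign bit after the 16-bit value is duplicated
// into both halves of each 32-bit lane. With AEM set, entries whose 16-bit value is zero get alpha 0.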
const GSVector4i rm = m_rm;
const GSVector4i gm = m_gm;
const GSVector4i bm = m_bm;
const GSVector4i TA0(TEXA.TA0 << 24);
const GSVector4i TA1(TEXA.TA1 << 24);
GSVector4i c, cl, ch;
const GSVector4i* s = (const GSVector4i*)src;
GSVector4i* d = (GSVector4i*)dst;
if (!TEXA.AEM)
{
for (int i = 0, j = w >> 3; i < j; i++)
{
c = s[i];
cl = c.upl16(c);
ch = c.uph16(c);
d[i * 2 + 0] = ((cl & rm) << 3) | ((cl & gm) << 6) | ((cl & bm) << 9) | TA0.blend8(TA1, cl.sra16<15>());
d[i * 2 + 1] = ((ch & rm) << 3) | ((ch & gm) << 6) | ((ch & bm) << 9) | TA0.blend8(TA1, ch.sra16<15>());
}
}
else
{
for (int i = 0, j = w >> 3; i < j; i++)
{
c = s[i];
cl = c.upl16(c);
ch = c.uph16(c);
d[i * 2 + 0] = ((cl & rm) << 3) | ((cl & gm) << 6) | ((cl & bm) << 9) | TA0.blend8(TA1, cl.sra16<15>()).andnot(cl == GSVector4i::zero());
d[i * 2 + 1] = ((ch & rm) << 3) | ((ch & gm) << 6) | ((ch & bm) << 9) | TA0.blend8(TA1, ch.sra16<15>()).andnot(ch == GSVector4i::zero());
}
}
}
bool GSClut::WriteState::IsDirty(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
constexpr u64 mask = 0x1FFFFFE000000000ull; // CSA CSM CPSM CBP
bool is_dirty = dirty;
if (((this->TEX0.U64 ^ TEX0.U64) & mask) || (GSLocalMemory::m_psm[this->TEX0.PSM].pal != GSLocalMemory::m_psm[TEX0.PSM].pal))
is_dirty |= true;
else if (TEX0.CSM == 1 && (TEXCLUT.U32[0] ^ this->TEXCLUT.U32[0]))
is_dirty |= true;
if (!is_dirty)
{
this->TEX0.U64 = TEX0.U64;
this->TEXCLUT.U64 = TEXCLUT.U64;
}
return is_dirty;
}
bool GSClut::ReadState::IsDirty(const GIFRegTEX0& TEX0)
{
constexpr u64 mask = 0x1FFFFFE000000000ull; // CSA CSM CPSM CBP
bool is_dirty = dirty;
if (((this->TEX0.U64 ^ TEX0.U64) & mask) || (GSLocalMemory::m_psm[this->TEX0.PSM].pal != GSLocalMemory::m_psm[TEX0.PSM].pal))
is_dirty |= true;
if (!is_dirty)
{
this->TEX0.U64 = TEX0.U64;
}
return is_dirty;
}
bool GSClut::ReadState::IsDirty(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA)
{
constexpr u64 tex0_mask = 0x1FFFFFE000000000ull; // CSA CSM CPSM CBP
constexpr u64 texa24_mask = 0x80FFull; // AEM TA0
constexpr u64 texa16_mask = 0xFF000080FFull; // TA1 AEM TA0
bool is_dirty = dirty;
if (((this->TEX0.U64 ^ TEX0.U64) & tex0_mask) || (GSLocalMemory::m_psm[this->TEX0.PSM].pal != GSLocalMemory::m_psm[TEX0.PSM].pal))
is_dirty |= true;
else // Just to optimise the checks.
{
// Check TA0 and AEM in 24bit mode.
if (TEX0.CPSM == PSMCT24 && ((this->TEXA.U64 ^ TEXA.U64) & texa24_mask))
is_dirty |= true;
// Check all fields in 16bit mode.
else if (TEX0.CPSM >= PSMCT16 && ((this->TEXA.U64 ^ TEXA.U64) & texa16_mask))
is_dirty |= true;
}
if (!is_dirty)
{
this->TEX0.U64 = TEX0.U64;
this->TEXA.U64 = TEXA.U64;
}
return is_dirty;
}