mirror of
https://github.com/PCSX2/pcsx2.git
synced 2026-01-31 01:15:24 +01:00
929 lines
24 KiB
C++
929 lines
24 KiB
C++
// SPDX-FileCopyrightText: 2002-2025 PCSX2 Dev Team
|
|
// SPDX-License-Identifier: GPL-3.0+
|
|
|
|
#include "GS/GSClut.h"
|
|
#include "GS/GSExtra.h"
|
|
#include "GS/GSLocalMemory.h"
|
|
#include "GS/GSGL.h"
|
|
#include "GS/GSUtil.h"
|
|
#include "GS/Renderers/Common/GSDevice.h"
|
|
#include "GS/Renderers/Common/GSRenderer.h"
|
|
#include "common/AlignedMalloc.h"
|
|
|
|
GSClut::GSClut(GSLocalMemory* mem)
	: m_mem(mem)
{
	// A single aligned allocation backs the CLUT (1k + 1k for the mirrored area
	// simulating wrapping memory) and, at fixed byte offsets, the expansion buffers.
	m_clut = static_cast<u16*>(_aligned_malloc(CLUT_ALLOC_SIZE, VECTOR_ALIGNMENT));
	if (!m_clut)
		pxFailRel("Failed to allocate CLUT storage.");

	u8* const storage = reinterpret_cast<u8*>(m_clut);
	m_buff32 = reinterpret_cast<u32*>(storage + 2048); // 1k
	m_buff64 = reinterpret_cast<u64*>(storage + 4096); // 2k

	// Nothing has been written or expanded yet, so both sides start out dirty.
	m_write.dirty = 1;
	m_read.dirty = true;

	// Build the writer dispatch table, indexed as m_wc[CSM][CPSM][PSM].
	for (int cpsm = 0; cpsm < 16; cpsm++)
	{
		for (int psm = 0; psm < 64; psm++)
		{
			// The GS seems to check the lower 3 bits to tell if the format is 8/4bit
			// for the reload.
			const int low_bits = psm & 0x7;
			const bool is_8bit = (low_bits == 0x3);
			const bool is_4bit = (low_bits == 0x4);

			// CSM1: default to the "ignore" handler, then override the supported combinations.
			auto& csm1_writer = m_wc[0][cpsm][psm];
			csm1_writer = &GSClut::WriteCLUT_NULL;

			switch (cpsm)
			{
				case PSMCT32:
				case PSMCT24: // undocumented (KH?)
					if (is_8bit)
						csm1_writer = &GSClut::WriteCLUT32_I8_CSM1;
					else if (is_4bit)
						csm1_writer = &GSClut::WriteCLUT32_I4_CSM1;
					break;

				case PSMCT16:
					if (is_8bit)
						csm1_writer = &GSClut::WriteCLUT16_I8_CSM1;
					else if (is_4bit)
						csm1_writer = &GSClut::WriteCLUT16_I4_CSM1;
					break;

				case PSMCT16S:
					if (is_8bit)
						csm1_writer = &GSClut::WriteCLUT16S_I8_CSM1;
					else if (is_4bit)
						csm1_writer = &GSClut::WriteCLUT16S_I4_CSM1;
					break;

				default:
					break;
			}

			// CSM2 entries are filled in explicitly below; everything else is ignored.
			// TODO: test this
			m_wc[1][cpsm][psm] = &GSClut::WriteCLUT_NULL;
		}
	}

	// CSM2: 8-bit index formats load 256 entries, 4-bit index formats load 16.
	const int formats_256[] = {PSMT8, PSMT8H};
	const int formats_16[] = {PSMT4, PSMT4HL, PSMT4HH};

	for (const int psm : formats_256)
	{
		m_wc[1][PSMCT32][psm] = &GSClut::WriteCLUT32_CSM2<256>;
		m_wc[1][PSMCT24][psm] = &GSClut::WriteCLUT32_CSM2<256>;
		m_wc[1][PSMCT16][psm] = &GSClut::WriteCLUT16_CSM2<256>;
		m_wc[1][PSMCT16S][psm] = &GSClut::WriteCLUT16S_CSM2<256>;
	}

	for (const int psm : formats_16)
	{
		m_wc[1][PSMCT32][psm] = &GSClut::WriteCLUT32_CSM2<16>;
		m_wc[1][PSMCT24][psm] = &GSClut::WriteCLUT32_CSM2<16>;
		m_wc[1][PSMCT16][psm] = &GSClut::WriteCLUT16_CSM2<16>;
		m_wc[1][PSMCT16S][psm] = &GSClut::WriteCLUT16S_CSM2<16>;
	}
}
|
|
|
|
GSClut::~GSClut()
{
	// The GPU palette textures are created lazily in Read32(), so either may still be null.
	delete m_gpu_clut4;
	delete m_gpu_clut8;

	// Release the allocation made in the constructor (CLUT plus expansion buffers).
	_aligned_free(m_clut);
}
|
|
|
|
// Returns the raw write-side dirty flags (non-zero means the CLUT needs reloading).
u8 GSClut::IsInvalid()
{
	const u8 dirty_flags = m_write.dirty;
	return dirty_flags;
}
|
|
|
|
void GSClut::ClearDrawInvalidity()
|
|
{
|
|
if (m_write.dirty & 2)
|
|
{
|
|
m_write.dirty = 1;
|
|
}
|
|
}
|
|
|
|
// Returns the CBP field of the TEX0 value recorded by the last Write().
u32 GSClut::GetCLUTCBP()
{
	const GIFRegTEX0& written = m_write.TEX0;
	return written.CBP;
}
|
|
|
|
// Returns the CPSM field of the TEX0 value recorded by the last Write().
u32 GSClut::GetCLUTCPSM()
{
	const GIFRegTEX0& written = m_write.TEX0;
	return written.CPSM;
}
|
|
|
|
// Stores the raw 64-bit TEX0 value that InvalidateRange() decodes to find the
// CLUT base pointer it tests writes against.
void GSClut::SetNextCLUTTEX0(u64 TEX0)
{
	m_write.next_tex0 = TEX0;
}
|
|
|
|
void GSClut::Reset()
|
|
{
|
|
std::memset(m_CBP, 0, sizeof(m_CBP));
|
|
std::memset(m_clut, 0, CLUT_ALLOC_SIZE);
|
|
m_write = {};
|
|
m_write.dirty = 1;
|
|
m_read = {};
|
|
m_read.dirty = true;
|
|
}
|
|
|
|
// Marks the CLUT dirty when the block range [start_block, end_block] overlaps the
// blocks starting at the next CLUT base pointer (CBP .. CBP + 3).
//
// start_block / end_block: first and last block touched by the write.
// is_draw: true when the write comes from a draw; sets flag 2 instead of flag 1.
// Returns the resulting dirty state (true when any dirty flag is set).
bool GSClut::InvalidateRange(u32 start_block, u32 end_block, bool is_draw)
{
	// Already invalidated by a draw; nothing stronger can be recorded.
	if (m_write.dirty & 2)
		return m_write.dirty;

	GIFRegTEX0 next_tex0;
	next_tex0.U64 = m_write.next_tex0;

	// Work on a full-width copy of CBP. CBP is a 14-bit field in the GS TEX0 layout,
	// so adding 0x4000 to the register field itself would be truncated straight back
	// to the original value and the wrap adjustment below would silently do nothing.
	u32 cbp = next_tex0.CBP;

	// Handle wrapping writes. Star Wars Battlefront 2 does this.
	if ((end_block & 0xFFE0) < (start_block & 0xFFE0))
	{
		// A CLUT sitting in the wrapped-around (low) part of the range is moved past
		// the end of memory so that it compares correctly against the unwrapped range.
		if ((cbp + 3U) <= end_block)
			cbp += 0x4000;

		end_block += 0x4000;
	}

	if ((cbp + 3U) >= start_block && end_block >= cbp)
	{
		m_write.dirty |= is_draw ? 2 : 1;
	}

	return m_write.dirty;
}
|
|
|
|
bool GSClut::CanLoadCLUT(const GIFRegTEX0& TEX0, const bool update_CBP)
|
|
{
|
|
if ((TEX0.PSM & 0x7) < 3)
|
|
return false;
|
|
|
|
switch (TEX0.CLD)
|
|
{
|
|
case 0:
|
|
case 6: // FFX2 menu.
|
|
case 7: // Ford Mustang Racing, Bouken Jidai Katsugeki Goemon.
|
|
return false;
|
|
case 1:
|
|
break;
|
|
case 2:
|
|
if (update_CBP)
|
|
m_CBP[0] = TEX0.CBP;
|
|
break;
|
|
case 3:
|
|
if (update_CBP)
|
|
m_CBP[1] = TEX0.CBP;
|
|
break;
|
|
case 4:
|
|
if (m_CBP[0] == TEX0.CBP)
|
|
return false;
|
|
if(update_CBP)
|
|
m_CBP[0] = TEX0.CBP;
|
|
break;
|
|
case 5:
|
|
if (m_CBP[1] == TEX0.CBP)
|
|
return false;
|
|
if (update_CBP)
|
|
m_CBP[1] = TEX0.CBP;
|
|
break;
|
|
default:
|
|
ASSUME(0);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool GSClut::WriteTest(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
|
{
|
|
// Check if PSM is an indexed format BEFORE the load condition, updating CBP0/1 on an invalid format is not allowed
|
|
// and can break games. Corvette (NTSC) is a good example of this.
|
|
if ((TEX0.PSM & 0x7) < 3)
|
|
return false;
|
|
|
|
if (!CanLoadCLUT(TEX0, true))
|
|
return false;
|
|
|
|
// CLUT only reloads if PSM is a valid index type, avoid unnecessary flushes.
|
|
return m_write.IsDirty(TEX0, TEXCLUT);
|
|
}
|
|
|
|
void GSClut::Write(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
|
{
|
|
m_write.TEX0 = TEX0;
|
|
m_write.TEXCLUT = TEXCLUT;
|
|
m_read.dirty = true;
|
|
m_write.dirty = 0;
|
|
|
|
(this->*m_wc[TEX0.CSM][TEX0.CPSM][TEX0.PSM])(TEX0, TEXCLUT);
|
|
}
|
|
|
|
void GSClut::WriteCLUT32_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
|
{
|
|
ALIGN_STACK(32);
|
|
WriteCLUT_T32_I8_CSM1((u32*)m_mem->BlockPtr32(0, 0, TEX0.CBP, 1), m_clut, (TEX0.CSA & 15));
|
|
}
|
|
|
|
void GSClut::WriteCLUT32_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
|
{
|
|
ALIGN_STACK(32);
|
|
|
|
WriteCLUT_T32_I4_CSM1((u32*)m_mem->BlockPtr32(0, 0, TEX0.CBP, 1), m_clut + ((TEX0.CSA & 15) << 4));
|
|
}
|
|
|
|
void GSClut::WriteCLUT16_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
|
{
|
|
WriteCLUT_T16_I8_CSM1((u16*)m_mem->BlockPtr16(0, 0, TEX0.CBP, 1), m_clut + (TEX0.CSA << 4));
|
|
}
|
|
|
|
void GSClut::WriteCLUT16_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
|
{
|
|
WriteCLUT_T16_I4_CSM1((u16*)m_mem->BlockPtr16(0, 0, TEX0.CBP, 1), m_clut + (TEX0.CSA << 4));
|
|
}
|
|
|
|
void GSClut::WriteCLUT16S_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
|
{
|
|
WriteCLUT_T16_I8_CSM1((u16*)m_mem->BlockPtr16S(0, 0, TEX0.CBP, 1), m_clut + (TEX0.CSA << 4));
|
|
}
|
|
|
|
void GSClut::WriteCLUT16S_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
|
{
|
|
WriteCLUT_T16_I4_CSM1((u16*)m_mem->BlockPtr16S(0, 0, TEX0.CBP, 1), m_clut + (TEX0.CSA << 4));
|
|
}
|
|
|
|
template <int n>
|
|
void GSClut::WriteCLUT32_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
|
{
|
|
GSOffset off = GSOffset::fromKnownPSM(TEX0.CBP, TEXCLUT.CBW, PSMCT32);
|
|
GSOffset::PAHelper pa = off.paMulti(TEXCLUT.COU << 4, TEXCLUT.COV);
|
|
|
|
u32* vm = m_mem->vm32();
|
|
u16* RESTRICT clut = m_clut + ((TEX0.CSA & 15) << 4);
|
|
|
|
for (int i = 0; i < n; i++)
|
|
{
|
|
u32 c = vm[pa.value(i)];
|
|
|
|
clut[i] = (u16)(c & 0xffff);
|
|
clut[i + 256] = (u16)(c >> 16);
|
|
}
|
|
}
|
|
|
|
template <int n>
|
|
void GSClut::WriteCLUT16_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
|
{
|
|
const GSOffset off = GSOffset::fromKnownPSM(TEX0.CBP, TEXCLUT.CBW, PSMCT16);
|
|
const GSOffset::PAHelper pa = off.paMulti(TEXCLUT.COU << 4, TEXCLUT.COV);
|
|
|
|
u16* vm = m_mem->vm16();
|
|
u16* RESTRICT clut = m_clut + (TEX0.CSA << 4);
|
|
|
|
for (int i = 0; i < n; i++)
|
|
{
|
|
clut[i] = vm[pa.value(i)];
|
|
}
|
|
}
|
|
|
|
template <int n>
|
|
void GSClut::WriteCLUT16S_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
|
{
|
|
const GSOffset off = GSOffset::fromKnownPSM(TEX0.CBP, TEXCLUT.CBW, PSMCT16S);
|
|
const GSOffset::PAHelper pa = off.paMulti(TEXCLUT.COU << 4, TEXCLUT.COV);
|
|
|
|
u16* vm = m_mem->vm16();
|
|
u16* RESTRICT clut = m_clut + (TEX0.CSA << 4);
|
|
|
|
for (int i = 0; i < n; i++)
|
|
{
|
|
clut[i] = vm[pa.value(i)];
|
|
}
|
|
}
|
|
|
|
void GSClut::WriteCLUT_NULL(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
|
{
|
|
// xenosaga3, bios
|
|
GL_INS("[WARNING] CLUT write ignored (psm: %d, cpsm: %d)", TEX0.PSM, TEX0.CPSM);
|
|
}
|
|
|
|
#if 0
// Disabled: older reader that expands the CLUT without taking TEXA into account.
// Kept for reference only; it is not compiled.
void GSClut::Read(const GIFRegTEX0& TEX0)
{
	if (!m_read.IsDirty(TEX0))
		return;

	m_read.TEX0 = TEX0;
	m_read.dirty = false;

	const bool index8 = (TEX0.PSM == PSMT8 || TEX0.PSM == PSMT8H);
	const bool index4 = (TEX0.PSM == PSMT4 || TEX0.PSM == PSMT4HL || TEX0.PSM == PSMT4HH);

	if (TEX0.CPSM == PSMCT32 || TEX0.CPSM == PSMCT24)
	{
		u16* clut = m_clut + ((TEX0.CSA & 15) << 4);

		if (index8)
			ReadCLUT_T32_I8(clut, m_buff32);
		else if (index4)
			ReadCLUT_T32_I4(clut, m_buff32, m_buff64);
	}
	else if (TEX0.CPSM == PSMCT16 || TEX0.CPSM == PSMCT16S)
	{
		u16* clut = m_clut + (TEX0.CSA << 4);

		if (index8)
			ReadCLUT_T16_I8(clut, m_buff32);
		else if (index4)
			ReadCLUT_T16_I4(clut, m_buff32, m_buff64);
	}
}
#endif
|
|
|
|
void GSClut::Read32(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA)
|
|
{
|
|
if (m_read.IsDirty(TEX0, TEXA))
|
|
{
|
|
m_read.TEX0 = TEX0;
|
|
m_read.TEXA = TEXA;
|
|
m_read.dirty = false;
|
|
m_read.adirty = true;
|
|
|
|
u16* clut = m_clut;
|
|
|
|
if (TEX0.CPSM == PSMCT32 || TEX0.CPSM == PSMCT24)
|
|
{
|
|
switch (TEX0.PSM)
|
|
{
|
|
case PSMT8:
|
|
case PSMT8H:
|
|
ReadCLUT_T32_I8(clut, m_buff32, (TEX0.CSA & 15) << 4);
|
|
break;
|
|
case PSMT4:
|
|
case PSMT4HL:
|
|
case PSMT4HH:
|
|
clut += (TEX0.CSA & 15) << 4;
|
|
// TODO: merge these functions
|
|
ReadCLUT_T32_I4(clut, m_buff32);
|
|
ExpandCLUT64_T32_I8(m_buff32, (u64*)m_buff64); // sw renderer does not need m_buff64 anymore
|
|
break;
|
|
}
|
|
}
|
|
else if (TEX0.CPSM == PSMCT16 || TEX0.CPSM == PSMCT16S)
|
|
{
|
|
switch (TEX0.PSM)
|
|
{
|
|
case PSMT8:
|
|
case PSMT8H:
|
|
clut += TEX0.CSA << 4;
|
|
Expand16(clut, m_buff32, 256, TEXA);
|
|
break;
|
|
case PSMT4:
|
|
case PSMT4HL:
|
|
case PSMT4HH:
|
|
clut += TEX0.CSA << 4;
|
|
// TODO: merge these functions
|
|
Expand16(clut, m_buff32, 16, TEXA);
|
|
ExpandCLUT64_T32_I8(m_buff32, (u64*)m_buff64); // sw renderer does not need m_buff64 anymore
|
|
break;
|
|
}
|
|
}
|
|
|
|
m_current_gpu_clut = nullptr;
|
|
if (GSConfig.UserHacks_GPUTargetCLUTMode != GSGPUTargetCLUTMode::Disabled)
|
|
{
|
|
const bool is_4bit = (TEX0.PSM == PSMT4 || TEX0.PSM == PSMT4HL || TEX0.PSM == PSMT4HH);
|
|
|
|
u32 CBW;
|
|
GSVector2i offset;
|
|
GSVector2i size;
|
|
float scale;
|
|
if (!TEX0.CSM)
|
|
{
|
|
CBW = 0; // don't care
|
|
offset = {};
|
|
size.x = is_4bit ? 8 : 16;
|
|
size.y = is_4bit ? 2 : 16;
|
|
}
|
|
else
|
|
{
|
|
CBW = m_write.TEXCLUT.CBW;
|
|
offset.x = m_write.TEXCLUT.COU;
|
|
offset.y = m_write.TEXCLUT.COV;
|
|
size.x = is_4bit ? 16 : 256;
|
|
size.y = 1;
|
|
}
|
|
|
|
GSTexture* src = g_gs_renderer->LookupPaletteSource(TEX0.CBP, TEX0.CPSM, CBW, offset, &scale, size);
|
|
if (src)
|
|
{
|
|
GSTexture* dst = is_4bit ? m_gpu_clut4 : m_gpu_clut8;
|
|
const u32 dst_size = is_4bit ? 16 : 256;
|
|
const u32 dOffset = (TEX0.CSA & ((TEX0.CPSM == PSMCT16 || TEX0.CPSM == PSMCT16S) ? 15u : 31u)) << 4;
|
|
if (!dst)
|
|
{
|
|
// allocate texture lazily
|
|
dst = g_gs_device->CreateRenderTarget(dst_size, 1, GSTexture::Format::Color, false);
|
|
is_4bit ? (m_gpu_clut4 = dst) : (m_gpu_clut8 = dst);
|
|
}
|
|
if (dst)
|
|
{
|
|
GL_PUSH("Update GPU CLUT [CBP=%04X, CPSM=%s, CBW=%u, CSA=%u, Offset=(%d,%d)]",
|
|
TEX0.CBP, psm_str(TEX0.CPSM), CBW, TEX0.CSA, offset.x, offset.y);
|
|
g_gs_device->UpdateCLUTTexture(src, scale, offset.x, offset.y, dst, dOffset, dst_size);
|
|
m_current_gpu_clut = dst;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void GSClut::GetAlphaMinMax32(int& amin_out, int& amax_out)
|
|
{
|
|
// call only after Read32
|
|
|
|
if (m_read.dirty)
|
|
GL_INS("GSClut: GetAlphaMinMax32 m_read.dirty");
|
|
|
|
if (m_read.adirty)
|
|
{
|
|
m_read.adirty = false;
|
|
|
|
if (GSLocalMemory::m_psm[m_read.TEX0.CPSM].trbpp == 24 && m_read.TEXA.AEM == 0)
|
|
{
|
|
m_read.amin = m_read.TEXA.TA0;
|
|
m_read.amax = m_read.TEXA.TA0;
|
|
}
|
|
else
|
|
{
|
|
const GSVector4i* p = (const GSVector4i*)m_buff32;
|
|
|
|
GSVector4i amin, amax;
|
|
|
|
if (GSLocalMemory::m_psm[m_read.TEX0.PSM].pal == 256)
|
|
{
|
|
amin = GSVector4i::xffffffff();
|
|
amax = GSVector4i::zero();
|
|
|
|
for (int i = 0; i < 16; i++)
|
|
{
|
|
GSVector4i v0 = (p[i * 4 + 0] >> 24).ps32(p[i * 4 + 1] >> 24);
|
|
GSVector4i v1 = (p[i * 4 + 2] >> 24).ps32(p[i * 4 + 3] >> 24);
|
|
GSVector4i v2 = v0.pu16(v1);
|
|
|
|
amin = amin.min_u8(v2);
|
|
amax = amax.max_u8(v2);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
pxAssert(GSLocalMemory::m_psm[m_read.TEX0.PSM].pal == 16);
|
|
|
|
const GSVector4i v0 = (p[0] >> 24).ps32(p[1] >> 24);
|
|
const GSVector4i v1 = (p[2] >> 24).ps32(p[3] >> 24);
|
|
const GSVector4i v2 = v0.pu16(v1);
|
|
|
|
amin = v2;
|
|
amax = v2;
|
|
}
|
|
|
|
amin = amin.min_u8(amin.zwxy());
|
|
amax = amax.max_u8(amax.zwxy());
|
|
amin = amin.min_u8(amin.zwxyl());
|
|
amax = amax.max_u8(amax.zwxyl());
|
|
amin = amin.min_u8(amin.yxwzl());
|
|
amax = amax.max_u8(amax.yxwzl());
|
|
|
|
const GSVector4i v0 = amin.upl8(amax).u8to16();
|
|
const GSVector4i v1 = v0.yxwz();
|
|
|
|
m_read.amin = v0.min_i16(v1).extract16<0>();
|
|
m_read.amax = v0.max_i16(v1).extract16<1>();
|
|
}
|
|
}
|
|
|
|
amin_out = m_read.amin;
|
|
amax_out = m_read.amax;
|
|
}
|
|
|
|
//
|
|
|
|
void GSClut::WriteCLUT_T32_I8_CSM1(const u32* RESTRICT src, u16* RESTRICT clut, u16 offset)
|
|
{
|
|
// This is required when CSA is offset from the base of the CLUT so we point to the right data
|
|
for (int i = offset; i < 16; i ++)
|
|
{
|
|
const int off = i << 4; // WriteCLUT_T32_I4_CSM1 loads 16 at a time
|
|
// Source column
|
|
const int s = clutTableT32I8[off & 0x70] | (off & 0x80);
|
|
|
|
WriteCLUT_T32_I4_CSM1(&src[s], &clut[off]);
|
|
}
|
|
}
|
|
|
|
// Loads one row of sixteen 32-bit colours. The shuffles split the data into two
// 32-byte halves: the first is stored at clut[0..15], the second 512 bytes further
// on (clut[256..271]), which is the split layout ReadCLUT_T32_I4 reads back.
__forceinline void GSClut::WriteCLUT_T32_I4_CSM1(const u32* RESTRICT src, u16* RESTRICT clut)
{
	// 1 block

#if _M_SSE >= 0x501

	const GSVector8i* in = reinterpret_cast<const GSVector8i*>(src);
	GSVector8i* out = reinterpret_cast<GSVector8i*>(clut);

	GSVector8i first = in[0];
	GSVector8i second = in[1];
	first = first.acbd();
	second = second.acbd();

	GSVector8i::sw16(first, second);
	GSVector8i::sw16(first, second);
	GSVector8i::sw16(first, second);

	out[0] = first;
	out[16] = second; // 16 * 32 bytes = 256 u16 entries further on

#else

	const GSVector4i* in = reinterpret_cast<const GSVector4i*>(src);
	GSVector4i* out = reinterpret_cast<GSVector4i*>(clut);

	GSVector4i q0 = in[0];
	GSVector4i q1 = in[1];
	GSVector4i q2 = in[2];
	GSVector4i q3 = in[3];

	GSVector4i::sw16(q0, q1, q2, q3);
	GSVector4i::sw32(q0, q1, q2, q3);
	GSVector4i::sw16(q0, q2, q1, q3);

	out[0] = q0;
	out[1] = q2;
	out[32] = q1; // 32 * 16 bytes = 256 u16 entries further on
	out[33] = q3;

#endif
}
|
|
|
|
void GSClut::WriteCLUT_T16_I8_CSM1(const u16* RESTRICT src, u16* RESTRICT clut)
|
|
{
|
|
// 2 blocks
|
|
|
|
GSVector4i* s = (GSVector4i*)src;
|
|
GSVector4i* d = (GSVector4i*)clut;
|
|
|
|
for (int i = 0; i < 32; i += 4)
|
|
{
|
|
GSVector4i v0 = s[i + 0];
|
|
GSVector4i v1 = s[i + 1];
|
|
GSVector4i v2 = s[i + 2];
|
|
GSVector4i v3 = s[i + 3];
|
|
|
|
GSVector4i::sw16(v0, v1, v2, v3);
|
|
GSVector4i::sw32(v0, v1, v2, v3);
|
|
GSVector4i::sw16(v0, v2, v1, v3);
|
|
|
|
d[i + 0] = v0;
|
|
d[i + 1] = v2;
|
|
d[i + 2] = v1;
|
|
d[i + 3] = v3;
|
|
}
|
|
}
|
|
|
|
// Loads sixteen 16-bit colours, gathering each one from the source position given
// by the clutTableT16I4 lookup.
__forceinline void GSClut::WriteCLUT_T16_I4_CSM1(const u16* RESTRICT src, u16* RESTRICT clut)
{
	// 1 block (half)

	int entry = 0;
	while (entry < 16)
	{
		const auto src_index = clutTableT16I4[entry];
		clut[entry] = src[src_index];
		++entry;
	}
}
|
|
|
|
void GSClut::ReadCLUT_T32_I8(const u16* RESTRICT clut, u32* RESTRICT dst, int offset)
|
|
{
|
|
// Okay this deserves a small explanation
|
|
// T32 I8 can address up to 256 colors however the offset can be "more than zero" when reading
|
|
// Previously I assumed that it would wrap around the end of the buffer to the beginning
|
|
// but it turns out this is incorrect, the address doesn't mirror, it clamps to to the last offset,
|
|
// probably though some sort of addressing mechanism then picks the color from the lower 0xF of the requested CLUT entry.
|
|
// if we don't do this, the dirt on GTA SA goes transparent and actually cleans the car driving through dirt.
|
|
for (int i = 0; i < 256; i += 16)
|
|
{
|
|
// Min value + offet or Last CSA * 16 (240)
|
|
ReadCLUT_T32_I4(&clut[std::min((i + offset), 240)], &dst[i]);
|
|
}
|
|
}
|
|
|
|
// Rebuilds sixteen 32-bit colours from the split CLUT layout: two vectors at
// clut[0..15] and two more 512 bytes further on (clut[256..271]), interleaved
// with sw16 and stored as four consecutive vectors in dst.
__forceinline void GSClut::ReadCLUT_T32_I4(const u16* RESTRICT clut, u32* RESTRICT dst)
{
	const GSVector4i* in = reinterpret_cast<const GSVector4i*>(clut);
	GSVector4i* out = reinterpret_cast<GSVector4i*>(dst);

	GSVector4i near0 = in[0];
	GSVector4i near1 = in[1];
	GSVector4i far0 = in[32]; // 32 * 16 bytes = 256 u16 entries further on
	GSVector4i far1 = in[33];

	GSVector4i::sw16(near0, far0, near1, far1);

	out[0] = near0;
	out[1] = near1;
	out[2] = far0;
	out[3] = far1;
}
|
|
|
|
#if 0
// Disabled: variant of ReadCLUT_T32_I4 that also fills the 64-bit pair table.
// Kept for reference only; it is not compiled.
__forceinline void GSClut::ReadCLUT_T32_I4(const u16* RESTRICT clut, u32* RESTRICT dst32, u64* RESTRICT dst64)
{
	const GSVector4i* in = reinterpret_cast<const GSVector4i*>(clut);
	GSVector4i* out32 = reinterpret_cast<GSVector4i*>(dst32);
	GSVector4i* out64 = reinterpret_cast<GSVector4i*>(dst64);

	GSVector4i c0 = in[0];
	GSVector4i c1 = in[1];
	GSVector4i c2 = in[32];
	GSVector4i c3 = in[33];

	GSVector4i::sw16(c0, c2, c1, c3);

	out32[0] = c0;
	out32[1] = c1;
	out32[2] = c2;
	out32[3] = c3;

	ExpandCLUT64_T32(c0, c0, c1, c2, c3, &out64[0]);
	ExpandCLUT64_T32(c1, c0, c1, c2, c3, &out64[32]);
	ExpandCLUT64_T32(c2, c0, c1, c2, c3, &out64[64]);
	ExpandCLUT64_T32(c3, c0, c1, c2, c3, &out64[96]);
}
#endif
|
|
|
|
#if 0
// Disabled: expands 256 16-bit entries, 16 at a time. Not compiled.
void GSClut::ReadCLUT_T16_I8(const u16* RESTRICT clut, u32* RESTRICT dst)
{
	for (int entry = 0; entry < 256; entry += 16)
		ReadCLUT_T16_I4(clut + entry, dst + entry);
}
#endif
|
|
|
|
#if 0
// Disabled: widens sixteen 16-bit entries to 32 bits with upl16/uph16. Not compiled.
__forceinline void GSClut::ReadCLUT_T16_I4(const u16* RESTRICT clut, u32* RESTRICT dst)
{
	const GSVector4i* in = reinterpret_cast<const GSVector4i*>(clut);
	GSVector4i* out = reinterpret_cast<GSVector4i*>(dst);

	GSVector4i first = in[0];
	GSVector4i second = in[1];

	out[0] = first.upl16();
	out[1] = first.uph16();
	out[2] = second.upl16();
	out[3] = second.uph16();
}
#endif
|
|
|
|
#if 0
// Disabled: variant of ReadCLUT_T16_I4 that also fills the 64-bit pair table.
// Kept for reference only; it is not compiled.
__forceinline void GSClut::ReadCLUT_T16_I4(const u16* RESTRICT clut, u32* RESTRICT dst32, u64* RESTRICT dst64)
{
	const GSVector4i* in = reinterpret_cast<const GSVector4i*>(clut);
	GSVector4i* out32 = reinterpret_cast<GSVector4i*>(dst32);
	GSVector4i* out64 = reinterpret_cast<GSVector4i*>(dst64);

	GSVector4i first = in[0];
	GSVector4i second = in[1];

	GSVector4i c0 = first.upl16();
	GSVector4i c1 = first.uph16();
	GSVector4i c2 = second.upl16();
	GSVector4i c3 = second.uph16();

	out32[0] = c0;
	out32[1] = c1;
	out32[2] = c2;
	out32[3] = c3;

	ExpandCLUT64_T16(c0, c0, c1, c2, c3, &out64[0]);
	ExpandCLUT64_T16(c1, c0, c1, c2, c3, &out64[32]);
	ExpandCLUT64_T16(c2, c0, c1, c2, c3, &out64[64]);
	ExpandCLUT64_T16(c3, c0, c1, c2, c3, &out64[96]);
}
#endif
|
|
|
|
void GSClut::ExpandCLUT64_T32_I8(const u32* RESTRICT src, u64* RESTRICT dst)
|
|
{
|
|
GSVector4i* s = (GSVector4i*)src;
|
|
GSVector4i* d = (GSVector4i*)dst;
|
|
|
|
const GSVector4i s0 = s[0];
|
|
const GSVector4i s1 = s[1];
|
|
const GSVector4i s2 = s[2];
|
|
const GSVector4i s3 = s[3];
|
|
|
|
ExpandCLUT64_T32(s0, s0, s1, s2, s3, &d[0]);
|
|
ExpandCLUT64_T32(s1, s0, s1, s2, s3, &d[32]);
|
|
ExpandCLUT64_T32(s2, s0, s1, s2, s3, &d[64]);
|
|
ExpandCLUT64_T32(s3, s0, s1, s2, s3, &d[96]);
|
|
}
|
|
|
|
// Pairs each of the four colours in `hi` (broadcast one at a time) with the sixteen
// colours in lo0..lo3, writing 32 vectors to dst (8 per colour of `hi`).
__forceinline void GSClut::ExpandCLUT64_T32(const GSVector4i& hi, const GSVector4i& lo0, const GSVector4i& lo1, const GSVector4i& lo2, const GSVector4i& lo3, GSVector4i* dst)
{
	const GSVector4i hx = hi.xxxx();
	const GSVector4i hy = hi.yyyy();
	const GSVector4i hz = hi.zzzz();
	const GSVector4i hw = hi.wwww();

	ExpandCLUT64_T32(hx, lo0, &dst[0]);
	ExpandCLUT64_T32(hx, lo1, &dst[2]);
	ExpandCLUT64_T32(hx, lo2, &dst[4]);
	ExpandCLUT64_T32(hx, lo3, &dst[6]);

	ExpandCLUT64_T32(hy, lo0, &dst[8]);
	ExpandCLUT64_T32(hy, lo1, &dst[10]);
	ExpandCLUT64_T32(hy, lo2, &dst[12]);
	ExpandCLUT64_T32(hy, lo3, &dst[14]);

	ExpandCLUT64_T32(hz, lo0, &dst[16]);
	ExpandCLUT64_T32(hz, lo1, &dst[18]);
	ExpandCLUT64_T32(hz, lo2, &dst[20]);
	ExpandCLUT64_T32(hz, lo3, &dst[22]);

	ExpandCLUT64_T32(hw, lo0, &dst[24]);
	ExpandCLUT64_T32(hw, lo1, &dst[26]);
	ExpandCLUT64_T32(hw, lo2, &dst[28]);
	ExpandCLUT64_T32(hw, lo3, &dst[30]);
}
|
|
|
|
// Combines `lo` with `hi` using the 32-bit unpack operations: the upl32 result
// goes to dst[0] and the uph32 result to dst[1].
__forceinline void GSClut::ExpandCLUT64_T32(const GSVector4i& hi, const GSVector4i& lo, GSVector4i* dst)
{
	GSVector4i* const low_half = dst;
	GSVector4i* const high_half = dst + 1;

	*low_half = lo.upl32(hi);
	*high_half = lo.uph32(hi);
}
|
|
|
|
#if 0
// Disabled: 16-bit counterpart of ExpandCLUT64_T32_I8. Not compiled.
void GSClut::ExpandCLUT64_T16_I8(const u32* RESTRICT src, u64* RESTRICT dst)
{
	const GSVector4i* in = reinterpret_cast<const GSVector4i*>(src);
	GSVector4i* out = reinterpret_cast<GSVector4i*>(dst);

	GSVector4i c0 = in[0];
	GSVector4i c1 = in[1];
	GSVector4i c2 = in[2];
	GSVector4i c3 = in[3];

	ExpandCLUT64_T16(c0, c0, c1, c2, c3, out + 0);
	ExpandCLUT64_T16(c1, c0, c1, c2, c3, out + 32);
	ExpandCLUT64_T16(c2, c0, c1, c2, c3, out + 64);
	ExpandCLUT64_T16(c3, c0, c1, c2, c3, out + 96);
}
#endif
|
|
|
|
// 16-bit counterpart of the five-input ExpandCLUT64_T32: broadcasts each lane of
// `hi` in turn and combines it with lo0..lo3, writing 32 vectors to dst.
__forceinline void GSClut::ExpandCLUT64_T16(const GSVector4i& hi, const GSVector4i& lo0, const GSVector4i& lo1, const GSVector4i& lo2, const GSVector4i& lo3, GSVector4i* dst)
{
	const GSVector4i hx = hi.xxxx();
	const GSVector4i hy = hi.yyyy();
	const GSVector4i hz = hi.zzzz();
	const GSVector4i hw = hi.wwww();

	ExpandCLUT64_T16(hx, lo0, &dst[0]);
	ExpandCLUT64_T16(hx, lo1, &dst[2]);
	ExpandCLUT64_T16(hx, lo2, &dst[4]);
	ExpandCLUT64_T16(hx, lo3, &dst[6]);

	ExpandCLUT64_T16(hy, lo0, &dst[8]);
	ExpandCLUT64_T16(hy, lo1, &dst[10]);
	ExpandCLUT64_T16(hy, lo2, &dst[12]);
	ExpandCLUT64_T16(hy, lo3, &dst[14]);

	ExpandCLUT64_T16(hz, lo0, &dst[16]);
	ExpandCLUT64_T16(hz, lo1, &dst[18]);
	ExpandCLUT64_T16(hz, lo2, &dst[20]);
	ExpandCLUT64_T16(hz, lo3, &dst[22]);

	ExpandCLUT64_T16(hw, lo0, &dst[24]);
	ExpandCLUT64_T16(hw, lo1, &dst[26]);
	ExpandCLUT64_T16(hw, lo2, &dst[28]);
	ExpandCLUT64_T16(hw, lo3, &dst[30]);
}
|
|
|
|
// Combines `lo` with `hi` using the 16-bit unpack operations: the upl16 result
// goes to dst[0] and the uph16 result to dst[1].
__forceinline void GSClut::ExpandCLUT64_T16(const GSVector4i& hi, const GSVector4i& lo, GSVector4i* dst)
{
	GSVector4i* const low_half = dst;
	GSVector4i* const high_half = dst + 1;

	*low_half = lo.upl16(hi);
	*high_half = lo.uph16(hi);
}
|
|
|
|
// TODO
|
|
|
|
// Channel masks applied by Expand16() to 16-bit colours:
// bits 0-4 (shifted left by 3), bits 5-9 (by 6) and bits 10-14 (by 9).
constinit const GSVector4i GSClut::m_rm = GSVector4i::cxpr(0x0000001f);
constinit const GSVector4i GSClut::m_gm = GSVector4i::cxpr(0x000003e0);
constinit const GSVector4i GSClut::m_bm = GSVector4i::cxpr(0x00007c00);
|
|
|
|
void GSClut::Expand16(const u16* RESTRICT src, u32* RESTRICT dst, int w, const GIFRegTEXA& TEXA)
|
|
{
|
|
pxAssert((w & 7) == 0);
|
|
|
|
const GSVector4i rm = m_rm;
|
|
const GSVector4i gm = m_gm;
|
|
const GSVector4i bm = m_bm;
|
|
|
|
const GSVector4i TA0(TEXA.TA0 << 24);
|
|
const GSVector4i TA1(TEXA.TA1 << 24);
|
|
|
|
GSVector4i c, cl, ch;
|
|
|
|
const GSVector4i* s = (const GSVector4i*)src;
|
|
GSVector4i* d = (GSVector4i*)dst;
|
|
|
|
if (!TEXA.AEM)
|
|
{
|
|
for (int i = 0, j = w >> 3; i < j; i++)
|
|
{
|
|
c = s[i];
|
|
cl = c.upl16(c);
|
|
ch = c.uph16(c);
|
|
d[i * 2 + 0] = ((cl & rm) << 3) | ((cl & gm) << 6) | ((cl & bm) << 9) | TA0.blend8(TA1, cl.sra16<15>());
|
|
d[i * 2 + 1] = ((ch & rm) << 3) | ((ch & gm) << 6) | ((ch & bm) << 9) | TA0.blend8(TA1, ch.sra16<15>());
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for (int i = 0, j = w >> 3; i < j; i++)
|
|
{
|
|
c = s[i];
|
|
cl = c.upl16(c);
|
|
ch = c.uph16(c);
|
|
d[i * 2 + 0] = ((cl & rm) << 3) | ((cl & gm) << 6) | ((cl & bm) << 9) | TA0.blend8(TA1, cl.sra16<15>()).andnot(cl == GSVector4i::zero());
|
|
d[i * 2 + 1] = ((ch & rm) << 3) | ((ch & gm) << 6) | ((ch & bm) << 9) | TA0.blend8(TA1, ch.sra16<15>()).andnot(ch == GSVector4i::zero());
|
|
}
|
|
}
|
|
}
|
|
|
|
bool GSClut::WriteState::IsDirty(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
|
|
{
|
|
constexpr u64 mask = 0x1FFFFFE000000000ull; // CSA CSM CPSM CBP
|
|
|
|
bool is_dirty = dirty;
|
|
|
|
if (((this->TEX0.U64 ^ TEX0.U64) & mask) || (GSLocalMemory::m_psm[this->TEX0.PSM].pal != GSLocalMemory::m_psm[TEX0.PSM].pal))
|
|
is_dirty |= true;
|
|
else if (TEX0.CSM == 1 && (TEXCLUT.U32[0] ^ this->TEXCLUT.U32[0]))
|
|
is_dirty |= true;
|
|
|
|
if (!is_dirty)
|
|
{
|
|
this->TEX0.U64 = TEX0.U64;
|
|
this->TEXCLUT.U64 = TEXCLUT.U64;
|
|
}
|
|
|
|
return is_dirty;
|
|
}
|
|
|
|
bool GSClut::ReadState::IsDirty(const GIFRegTEX0& TEX0)
|
|
{
|
|
constexpr u64 mask = 0x1FFFFFE000000000ull; // CSA CSM CPSM CBP
|
|
|
|
bool is_dirty = dirty;
|
|
|
|
if (((this->TEX0.U64 ^ TEX0.U64) & mask) || (GSLocalMemory::m_psm[this->TEX0.PSM].pal != GSLocalMemory::m_psm[TEX0.PSM].pal))
|
|
is_dirty |= true;
|
|
|
|
if (!is_dirty)
|
|
{
|
|
this->TEX0.U64 = TEX0.U64;
|
|
}
|
|
|
|
return is_dirty;
|
|
}
|
|
|
|
bool GSClut::ReadState::IsDirty(const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA)
|
|
{
|
|
constexpr u64 tex0_mask = 0x1FFFFFE000000000ull; // CSA CSM CPSM CBP
|
|
constexpr u64 texa24_mask = 0x80FFull; // AEM TA0
|
|
constexpr u64 texa16_mask = 0xFF000080FFull; // TA1 AEM TA0
|
|
|
|
bool is_dirty = dirty;
|
|
|
|
if (((this->TEX0.U64 ^ TEX0.U64) & tex0_mask) || (GSLocalMemory::m_psm[this->TEX0.PSM].pal != GSLocalMemory::m_psm[TEX0.PSM].pal))
|
|
is_dirty |= true;
|
|
else // Just to optimise the checks.
|
|
{
|
|
// Check TA0 and AEM in 24bit mode.
|
|
if (TEX0.CPSM == PSMCT24 && ((this->TEXA.U64 ^ TEXA.U64) & texa24_mask))
|
|
is_dirty |= true;
|
|
// Check all fields in 16bit mode.
|
|
else if (TEX0.CPSM >= PSMCT16 && ((this->TEXA.U64 ^ TEXA.U64) & texa16_mask))
|
|
is_dirty |= true;
|
|
}
|
|
|
|
if (!is_dirty)
|
|
{
|
|
this->TEX0.U64 = TEX0.U64;
|
|
this->TEXA.U64 = TEXA.U64;
|
|
}
|
|
|
|
return is_dirty;
|
|
}
|