beetle-psx-libretro/mednafen/psx/mdec.cpp

801 lines
19 KiB
C++

/* Mednafen - Multi-system Emulator
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
MDEC_READ_FIFO(tfr) vs InCounter vs MDEC_DMACanRead() is a bit fragile right now. Actually, the entire horrible state machine monstrosity is fragile.
TODO: OutFIFOReady, so <16bpp works right.
TODO CODE:
bool InFIFOReady;
if(InFIFO.CanWrite())
{
InFIFO.Write(V);
if(InCommand)
{
if(InCounter != 0xFFFF)
{
InCounter--;
// This condition when InFIFO.CanWrite() != 0 is a bit buggy on real hardware, decoding loop *seems* to be reading too
// much and pulling garbage from the FIFO.
if(InCounter == 0xFFFF)
InFIFOReady = true;
}
if(InFIFO.CanWrite() == 0)
InFIFOReady = true;
}
}
*/
// Good test-case games:
// Dragon Knight 4(bad disc?)
// Final Fantasy 7 intro movie.
// GameShark Version 4.0 intro movie; (clever) abuse of DMA channel 0.
// SimCity 2000 startup.
#include "psx.h"
#include "mdec.h"
#include "../masmem.h"
#include "FastFIFO.h"
#include <math.h>
#if defined(__SSE2__)
#include <xmmintrin.h>
#include <emmintrin.h>
#endif
#if defined(ARCH_POWERPC_ALTIVEC) && defined(HAVE_ALTIVEC_H)
#include <altivec.h>
#endif
// Cycle budget for MDEC_Run(): increased by the clocks passed in (capped at 128) and decreased as work is performed.
static int32 ClockCounter;
// Resume point of the MDEC_Run() state machine; selects which case label the next call jumps to.
static unsigned MDRPhase;
// Input words queued by MDEC_Write()/MDEC_DMAWrite() and consumed by MDEC_Run().
static FastFIFO<uint32, 0x20> InFIFO;
// Output words queued by MDEC_Run() and drained by MDEC_Read()/MDEC_DMARead().
static FastFIFO<uint32, 0x20> OutFIFO;
// IDCT results. WriteImageData() stores block 0 into block_cr, block 1 into block_cb and blocks 2..5 into block_y.
static int8 block_y[8][8];
static int8 block_cb[8][8]; // [y >> 1][x >> 1]
static int8 block_cr[8][8]; // [y >> 1][x >> 1]
// Last value written to the control register (address bit 2 set), with bit 31 masked off.
static uint32 Control;
// Command word most recently fetched from InFIFO by MDEC_Run().
static uint32 Command;
// True once MDEC_Run() has fetched a command word; cleared at the top of its loop and on reset.
static bool InCommand;
// Quantization tables loaded by command 2. WriteImageData() uses table [1] while DecodeWB < 2 and table [0] otherwise.
static uint8 QMatrix[2][64];
// Byte write position inside QMatrix while command 2 is loading it (wraps at 0x80).
static uint32 QMIndex;
// IDCT matrix loaded by command 3 (values are stored arithmetically shifted right by 3) and read by IDCT_1D_Multi().
static int16 IDCTMatrix[64] MDFN_ALIGN(16);
// Write position inside IDCTMatrix while command 3 is loading it (wraps at 0x40).
static uint32 IDCTMIndex;
// Quantization scale taken from the top 6 bits of the first halfword of each block.
static uint8 QScale;
// Dequantized coefficients of the block being assembled; written through ZigZag[] by WriteImageData().
static int16 Coeff[64] MDFN_ALIGN(16);
// Number of coefficients received so far for the current block; reset to 0 when it reaches 64.
static uint32 CoeffIndex;
// Index of the block currently being decoded: 0 -> block_cr, 1 -> block_cb, 2..5 -> block_y.
static uint32 DecodeWB;
// Pixel data produced by EncodeImage(), viewed as 32-, 16- or 8-bit units depending on the output depth.
static union
{
uint32 pix32[48];
uint16 pix16[96];
uint8 pix8[192];
} PixelBuffer;
// Next 32-bit word of PixelBuffer that MDEC_Run() will push to OutFIFO.
static uint32 PixelBufferReadOffset;
// Number of valid 32-bit words in PixelBuffer (set by EncodeImage()).
static uint32 PixelBufferCount32;
// Word counter for the current command; the transfer loops in MDEC_Run() stop once it wraps to 0xFFFF.
static uint16 InCounter;
// State used by MDEC_DMARead() to compute the offset it reports through *offs.
// RAMOffsetWWS is chosen from the output depth when a decode command starts (0, 6 or 4).
static uint8 RAMOffsetY;
static uint8 RAMOffsetCounter;
static uint8 RAMOffsetWWS;
// Scan-order table: entry i is the index within Coeff[] that receives the i-th
// coefficient of a block as it arrives in the input stream (see WriteImageData()).
static const uint8 ZigZag[64] =
{
0x00, 0x08, 0x01, 0x02, 0x09, 0x10, 0x18, 0x11,
0x0a, 0x03, 0x04, 0x0b, 0x12, 0x19, 0x20, 0x28,
0x21, 0x1a, 0x13, 0x0c, 0x05, 0x06, 0x0d, 0x14,
0x1b, 0x22, 0x29, 0x30, 0x38, 0x31, 0x2a, 0x23,
0x1c, 0x15, 0x0e, 0x07, 0x0f, 0x16, 0x1d, 0x24,
0x2b, 0x32, 0x39, 0x3a, 0x33, 0x2c, 0x25, 0x1e,
0x17, 0x1f, 0x26, 0x2d, 0x34, 0x3b, 0x3c, 0x35,
0x2e, 0x27, 0x2f, 0x36, 0x3d, 0x3e, 0x37, 0x3f,
};
// Puts every piece of MDEC state into its power-on condition:
// empty FIFOs, zeroed tables and buffers, and the state machine at its first phase.
void MDEC_Power(void)
{
 // State machine position and cycle budget.
 MDRPhase = 0;
 ClockCounter = 0;

 // Data paths.
 InFIFO.Flush();
 OutFIFO.Flush();

 // Registers and command tracking.
 Control = 0;
 Command = 0;
 InCommand = false;
 InCounter = 0;

 // Quantization and IDCT tables, together with their load cursors.
 memset(QMatrix, 0, sizeof(QMatrix));
 memset(IDCTMatrix, 0, sizeof(IDCTMatrix));
 QMIndex = 0;
 IDCTMIndex = 0;

 // Coefficient assembly state.
 memset(Coeff, 0, sizeof(Coeff));
 CoeffIndex = 0;
 QScale = 0;
 DecodeWB = 0;

 // Decoded blocks and encoded pixel output.
 memset(block_y, 0, sizeof(block_y));
 memset(block_cb, 0, sizeof(block_cb));
 memset(block_cr, 0, sizeof(block_cr));
 memset(&PixelBuffer, 0, sizeof(PixelBuffer));
 PixelBufferReadOffset = 0;
 PixelBufferCount32 = 0;

 // Output address sequencing used by MDEC_DMARead().
 RAMOffsetY = 0;
 RAMOffsetCounter = 0;
 RAMOffsetWWS = 0;
}
// Saves or restores the complete MDEC state through the save-state framework.
//
//  sm        - save-state memory object, forwarded to MDFNSS_StateAction().
//  load      - non-zero when restoring; the FIFOs are then given a chance to
//              fix up their internal state via SaveStatePostLoad().
//  data_only - forwarded unchanged to MDFNSS_StateAction().
//
// Returns whatever MDFNSS_StateAction() returns for the "MDEC" section.
int MDEC_StateAction(StateMem *sm, int load, int data_only)
{
 SFORMAT StateRegs[] =
 {
  SFVAR(ClockCounter),
  SFVAR(MDRPhase),

// Expands to the four save-state entries describing one 32-bit FastFIFO.
#define SFFIFO32(fifoobj) SFARRAY32(&fifoobj.data[0], sizeof(fifoobj.data) / sizeof(fifoobj.data[0])), \
 SFVAR(fifoobj.read_pos), \
 SFVAR(fifoobj.write_pos), \
 SFVAR(fifoobj.in_count)

  SFFIFO32(InFIFO),
  SFFIFO32(OutFIFO),

// The original code undefined "SFFIFO", which left SFFIFO32 defined for the rest of the file.
#undef SFFIFO32

  SFARRAY(&block_y[0][0], sizeof(block_y) / sizeof(block_y[0][0])),
  SFARRAY(&block_cb[0][0], sizeof(block_cb) / sizeof(block_cb[0][0])),
  SFARRAY(&block_cr[0][0], sizeof(block_cr) / sizeof(block_cr[0][0])),

  SFVAR(Control),
  SFVAR(Command),
  SFVAR(InCommand),

  SFARRAY(&QMatrix[0][0], sizeof(QMatrix) / sizeof(QMatrix[0][0])),
  SFVAR(QMIndex),

  SFARRAY16(&IDCTMatrix[0], sizeof(IDCTMatrix) / sizeof(IDCTMatrix[0])),
  SFVAR(IDCTMIndex),

  SFVAR(QScale),

  SFARRAY16(&Coeff[0], sizeof(Coeff) / sizeof(Coeff[0])),
  SFVAR(CoeffIndex),
  SFVAR(DecodeWB),

  SFARRAY32(&PixelBuffer.pix32[0], sizeof(PixelBuffer.pix32) / sizeof(PixelBuffer.pix32[0])),
  SFVAR(PixelBufferReadOffset),
  SFVAR(PixelBufferCount32),

  SFVAR(InCounter),

  SFVAR(RAMOffsetY),
  SFVAR(RAMOffsetCounter),
  SFVAR(RAMOffsetWWS),

  SFEND
 };

 int ret = MDFNSS_StateAction(sm, load, data_only, StateRegs, "MDEC");

 if(load)
 {
  InFIFO.SaveStatePostLoad();
  OutFIFO.SaveStatePostLoad();
 }

 return(ret);
}
// Passes v through sign_x_to_s32(9, v) (presumably a sign-extension from 9 bits;
// the helper is defined elsewhere) and saturates the result to [-128, 127].
static INLINE int8 Mask9ClampS8(int32 v)
{
 const int32 wrapped = sign_x_to_s32(9, v);

 if(wrapped < -128)
  return -128;

 return (wrapped > 127) ? 127 : wrapped;
}
// One pass of the separable 8x8 IDCT.
//
// For every group of 8 consecutive input values (index "col") and every row "x" of
// IDCTMatrix, computes the dot product of the two 8-element vectors, rounds it
// with (sum + 0x4000) >> 15 and stores it:
//  - when T is one byte wide: clamped through Mask9ClampS8() at out_coeff[col * 8 + x];
//  - otherwise: unclamped and transposed, at out_coeff[x * 8 + col].
//
// in_coeff  - 64 input values; the SSE2 path reads them with aligned 128-bit loads,
//             so the buffer must be 16-byte aligned.
// out_coeff - 64 output values.
template<typename T>
static void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
{
#if defined(__SSE2__)
{
for(unsigned col = 0; col < 8; col++)
{
// The 8 input values of this group, loaded once and reused for all 8 matrix rows.
__m128i c = _mm_load_si128((__m128i *)&in_coeff[(col * 8)]);
for(unsigned x = 0; x < 8; x++)
{
__m128i sum;
__m128i m;
int32 tmp[4] MDFN_ALIGN(16);
m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]);
// Multiply the 8 int16 pairs and add adjacent products, giving 4 partial 32-bit sums.
sum = _mm_madd_epi16(m, c);
// Add the lane-reversed vector: lane 0 becomes s0 + s3 and lane 1 becomes s1 + s2.
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
// Bring lane 1 into lane 0 and add, so lane 0 holds the full 8-term dot product.
sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2)));
//_mm_store_ss((float *)&tmp[0], (__m128)sum);
_mm_store_si128((__m128i*)tmp, sum);
// Only lane 0 (tmp[0]) is meaningful after the reduction above.
if(sizeof(T) == 1)
out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15);
else
out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15;
}
}
}
#else
// Portable scalar version of the same computation.
for(unsigned col = 0; col < 8; col++)
{
for(unsigned x = 0; x < 8; x++)
{
int32 sum = 0;
for(unsigned u = 0; u < 8; u++)
{
sum += (in_coeff[(col * 8) + u] * IDCTMatrix[(x * 8) + u]);
}
if(sizeof(T) == 1)
out_coeff[(col * 8) + x] = Mask9ClampS8((sum + 0x4000) >> 15);
else
out_coeff[(x * 8) + col] = (sum + 0x4000) >> 15;
}
}
#endif
}
// Full 8x8 inverse DCT built from two 1-D passes.
//
// The first pass writes 16-bit intermediates (stored transposed by IDCT_1D_Multi),
// the second pass turns them into clamped 8-bit samples in out_coeff.
// in_coeff and out_coeff each hold 64 elements.
static void IDCT(int16 *in_coeff, int8 *out_coeff)
{
 int16 first_pass[64] MDFN_ALIGN(16);

 IDCT_1D_Multi<int16>(in_coeff, first_pass);
 IDCT_1D_Multi<int8>(first_pass, out_coeff);
}
// Converts one signed YCbCr sample to RGB.
//
// Each channel is computed as y plus a fixed-point (x/256, rounded) chroma term,
// clamped through Mask9ClampS8(), and finally has bit 7 toggled. Results are
// returned through r, g and b.
//
// Note: the green formula is still slightly off (precision/rounding issues when
// both cb and cr are non-zero).
static INLINE void YCbCr_to_RGB(const int8 y, const int8 cb, const int8 cr, int &r, int &g, int &b)
{
 const int red_term = ((359 * cr) + 0x80) >> 8;
 // Low bits of each green product are discarded before summing.
 const int green_term = (((-88 * cb) &~ 0x1F) + ((-183 * cr) &~ 0x07) + 0x80) >> 8;
 const int blue_term = ((454 * cb) + 0x80) >> 8;

 r = Mask9ClampS8(y + red_term);
 g = Mask9ClampS8(y + green_term);
 b = Mask9ClampS8(y + blue_term);

 r ^= 0x80;
 g ^= 0x80;
 b ^= 0x80;
}
// Packs 8-bit R, G, B into a 15-bit value: red in bits 0-4, green in bits 5-9,
// blue in bits 10-14. Each channel is rounded to 5 bits ((c + 4) >> 3) and
// saturated at 0x1F.
static INLINE uint16 RGB_to_RGB555(uint8 r, uint8 g, uint8 b)
{
 unsigned r5 = (r + 4) >> 3;
 unsigned g5 = (g + 4) >> 3;
 unsigned b5 = (b + 4) >> 3;

 // Rounding can yield 32 for inputs near 255; saturate to the 5-bit maximum.
 r5 = (r5 > 0x1F) ? 0x1F : r5;
 g5 = (g5 > 0x1F) ? 0x1F : g5;
 b5 = (b5 > 0x1F) ? 0x1F : b5;

 return (uint16)(r5 | (g5 << 5) | (b5 << 10));
}
static void EncodeImage(const unsigned ybn)
{
//printf("ENCODE, %d\n", (Command & 0x08000000) ? 256 : 384);
PixelBufferCount32 = 0;
switch((Command >> 27) & 0x3)
{
case 0: // 4bpp
{
const uint8 us_xor = (Command & (1U << 26)) ? 0x00 : 0x88;
uint8* pix_out = PixelBuffer.pix8;
for(int y = 0; y < 8; y++)
{
for(int x = 0; x < 8; x += 2)
{
uint8 p0 = std::min<int>(127, block_y[y][x + 0] + 8);
uint8 p1 = std::min<int>(127, block_y[y][x + 1] + 8);
*pix_out = ((p0 >> 4) | (p1 & 0xF0)) ^ us_xor;
pix_out++;
}
}
PixelBufferCount32 = 8;
}
break;
case 1: // 8bpp
{
const uint8 us_xor = (Command & (1U << 26)) ? 0x00 : 0x80;
uint8* pix_out = PixelBuffer.pix8;
for(int y = 0; y < 8; y++)
{
for(int x = 0; x < 8; x++)
{
*pix_out = (uint8)block_y[y][x] ^ us_xor;
pix_out++;
}
}
PixelBufferCount32 = 16;
}
break;
case 2: // 24bpp
{
const uint8 rgb_xor = (Command & (1U << 26)) ? 0x80 : 0x00;
uint8* pix_out = PixelBuffer.pix8;
for(int y = 0; y < 8; y++)
{
const int8* by = &block_y[y][0];
const int8* cb = &block_cb[(y >> 1) | ((ybn & 2) << 1)][(ybn & 1) << 2];
const int8* cr = &block_cr[(y >> 1) | ((ybn & 2) << 1)][(ybn & 1) << 2];
for(int x = 0; x < 8; x++)
{
int r, g, b;
YCbCr_to_RGB(by[x], cb[x >> 1], cr[x >> 1], r, g, b);
pix_out[0] = r ^ rgb_xor;
pix_out[1] = g ^ rgb_xor;
pix_out[2] = b ^ rgb_xor;
pix_out += 3;
}
}
PixelBufferCount32 = 48;
}
break;
case 3: // 16bpp
{
uint16 pixel_xor = ((Command & 0x02000000) ? 0x8000 : 0x0000) | ((Command & (1U << 26)) ? 0x4210 : 0x0000);
uint16* pix_out = PixelBuffer.pix16;
for(int y = 0; y < 8; y++)
{
const int8* by = &block_y[y][0];
const int8* cb = &block_cb[(y >> 1) | ((ybn & 2) << 1)][(ybn & 1) << 2];
const int8* cr = &block_cr[(y >> 1) | ((ybn & 2) << 1)][(ybn & 1) << 2];
for(int x = 0; x < 8; x++)
{
int r, g, b;
YCbCr_to_RGB(by[x], cb[x >> 1], cr[x >> 1], r, g, b);
StoreU16_LE(pix_out, pixel_xor ^ RGB_to_RGB555(r, g, b));
pix_out++;
}
}
PixelBufferCount32 = 32;
}
break;
}
}
// Saturates a dequantized coefficient to the range [-0x4000, 0x3FFF].
// (Unclear whether the upper bound should be 0x3FFF, 0x3FF0 or 0x3FF8.)
static INLINE int MDEC_ClampCoeff(int v)
{
 if(v < -0x4000)
  return -0x4000;

 if(v > 0x3FFF)
  return 0x3FFF;

 return v;
}

// Consumes one 16-bit unit of compressed block data.
//
// The first unit of a block carries the quantization scale (top 6 bits) and the
// first coefficient (low 10 bits, signed). Later units carry a count of zero
// coefficients to insert (top 6 bits) followed by one coefficient (low 10 bits).
// 0xFE00 ends a block early, zero-filling the remainder; before any coefficient
// has been received it is ignored.
//
// Once 64 coefficients are present the block is run through the IDCT into
// block_cr / block_cb / block_y according to DecodeWB, luma blocks are handed
// to EncodeImage(), and DecodeWB advances to the next block of the macroblock.
//
// V          - the 16-bit input unit.
// eat_cycles - incremented by the cycle cost of each completed block.
static INLINE void WriteImageData(uint16 V, int32* eat_cycles)
{
 // Table 1 is used while decoding blocks 0 and 1 (Cr, Cb); table 0 for the rest.
 const uint32 qtable = (bool)(DecodeWB < 2);
 const bool end_marker = (V == 0xFE00);

 if(CoeffIndex == 0)
 {
  if(end_marker)
   return;

  QScale = V >> 10;

  const int quant = QMatrix[qtable][0]; // No QScale for the first coefficient.
  const int level = sign_10_to_s16(V & 0x3FF);
  int bias = 0;
  int scaled;

  if(level < 0)
   bias = 8;
  else if(level > 0)
   bias = -8;

  if(quant != 0)
   scaled = (int32)((uint32)(level * quant) << 4) + bias;
  else
   scaled = (uint32)(level * 2) << 4;

  Coeff[ZigZag[0]] = MDEC_ClampCoeff(scaled);
  CoeffIndex++;
 }
 else if(end_marker)
 {
  // Zero-fill everything that has not been received.
  while(CoeffIndex < 64)
   Coeff[ZigZag[CoeffIndex++]] = 0;
 }
 else
 {
  // Insert the run of zero coefficients that precedes this value.
  uint32 zero_run = V >> 10;

  while(zero_run != 0 && CoeffIndex < 64)
  {
   Coeff[ZigZag[CoeffIndex]] = 0;
   CoeffIndex++;
   zero_run--;
  }

  if(CoeffIndex < 64)
  {
   const int quant = QScale * QMatrix[qtable][CoeffIndex];
   const int level = sign_10_to_s16(V & 0x3FF);
   int bias = 0;
   int scaled;

   if(level < 0)
    bias = 8;
   else if(level > 0)
    bias = -8;

   if(quant != 0)
    scaled = (int32)((uint32)((level * quant) >> 3) << 4) + bias;
   else
    scaled = (uint32)(level * 2) << 4;

   Coeff[ZigZag[CoeffIndex]] = MDEC_ClampCoeff(scaled);
   CoeffIndex++;
  }
 }

 if(CoeffIndex != 64)
  return;

 // A complete block is available.
 CoeffIndex = 0;

 switch(DecodeWB)
 {
  case 0:
   IDCT(Coeff, &block_cr[0][0]);
   break;

  case 1:
   IDCT(Coeff, &block_cb[0][0]);
   break;

  case 2:
  case 3:
  case 4:
  case 5:
   IDCT(Coeff, &block_y[0][0]);
   break;
 }

 // Timing in the actual PS1 MDEC is complex due to (apparent) pipelining, but the
 // average over a large number of blocks is about 512 cycles. A lower value is used
 // to be conservative given timing granularity and other timing deficiencies in
 // Mednafen. Do not go below 460, or Parasite Eve 2's 3D models will stutter badly
 // during FMV-background sequences.
 *eat_cycles += 474;

 if(DecodeWB >= 2)
  EncodeImage((DecodeWB + 4) % 6);

 DecodeWB++;

 // Depths 2 and 3 cycle through six blocks (0..5); depths 0 and 1 only ever use block 2.
 const bool six_blocks = ((Command >> 27) & 2) != 0;

 if(DecodeWB == (six_blocks ? 6U : 3U))
  DecodeWB = six_blocks ? 0 : 2;
}
// Advances the MDEC command state machine.
//
// clocks - number of cycles to add to ClockCounter before running (callers in this
//          file pass 0 simply to let the machine react to new FIFO contents).
//
// The function is written as a resumable routine: the outer switch jumps to a case
// label placed inside the command loop, selected by MDRPhase. Whenever the machine
// has to wait (input FIFO empty, output FIFO full, or cycle budget exhausted) it
// stores the phase whose case label is the wait point itself and returns; the next
// call re-evaluates that wait condition and continues from there.
void MDEC_Run(int32 clocks)
{
// Added to MDRPhase so that the power-on/reset value 0 maps to case 1.
static const unsigned MDRPhaseBias = 0 + 1;
//MDFN_DispMessage("%u", OutFIFO.CanRead());
ClockCounter += clocks;
// Cap the accumulated cycle budget.
if(ClockCounter > 128)
{
//if(MDRPhase != 0)
// printf("SNORT: %d\n", ClockCounter);
ClockCounter = 128;
}
switch(MDRPhase + MDRPhaseBias)
{
for(;;)
{
InCommand = false;
// Wait point (case 1): a command word must be available in InFIFO.
{ { case 1: if(!(InFIFO.CanRead())) { MDRPhase = 2 - MDRPhaseBias - 1; return; } }; Command = InFIFO.Read(); };
InCommand = true;
// Wait point (case 3): charge one cycle, then wait until the budget is positive.
{ ClockCounter -= (1); { case 3: if(!(ClockCounter > 0)) { MDRPhase = 4 - MDRPhaseBias - 1; return; } }; };
//printf("****************** Command: %08x, %02x\n", Command, Command >> 29);
//
// Command 1: decode macroblocks.
//
if(((Command >> 29) & 0x7) == 1)
{
InCounter = Command & 0xFFFF;
OutFIFO.Flush();
//OutBuffer.Flush();
PixelBufferCount32 = 0;
CoeffIndex = 0;
// Output depths 2 and 3 start with block 0; depths 0 and 1 start at block 2.
if((Command >> 27) & 2)
DecodeWB = 0;
else
DecodeWB = 2;
// Select the value MDEC_DMARead() uses when computing output offsets for this depth.
switch((Command >> 27) & 0x3)
{
case 0:
case 1: RAMOffsetWWS = 0; break;
case 2: RAMOffsetWWS = 6; break;
case 3: RAMOffsetWWS = 4; break;
}
RAMOffsetY = 0;
RAMOffsetCounter = RAMOffsetWWS;
// InCounter is decremented once up front and once per word; the loop ends when it wraps to 0xFFFF.
InCounter--;
do
{
uint32 tfr;
int32 need_eat; // = 0;
// Wait point (case 5): next input word.
{ { case 5: if(!(InFIFO.CanRead())) { MDRPhase = 6 - MDRPhaseBias - 1; return; } }; tfr = InFIFO.Read(); };
InCounter--;
// printf("KA: %04x %08x\n", InCounter, tfr);
need_eat = 0;
PixelBufferCount32 = 0;
// Each 32-bit word carries two 16-bit units, low half first.
WriteImageData(tfr, &need_eat);
WriteImageData(tfr >> 16, &need_eat);
// Wait point (case 7): pay for any block(s) completed by this word.
{ ClockCounter -= (need_eat); { case 7: if(!(ClockCounter > 0)) { MDRPhase = 8 - MDRPhaseBias - 1; return; } }; };
PixelBufferReadOffset = 0;
// Move any pixels produced by EncodeImage() into OutFIFO.
while(PixelBufferReadOffset != PixelBufferCount32)
{
// Wait point (case 9): room in OutFIFO.
{ { case 9: if(!(OutFIFO.CanWrite())) { MDRPhase = 10 - MDRPhaseBias - 1; return; } }; OutFIFO.Write(LoadU32_LE(&PixelBuffer.pix32[PixelBufferReadOffset++])); };
}
} while(InCounter != 0xFFFF);
}
//
// Command 2: load quantization table(s); 0x10 words, or 0x20 words when Command bit 0 is set.
//
else if(((Command >> 29) & 0x7) == 2)
{
QMIndex = 0;
InCounter = 0x10 + ((Command & 0x1) ? 0x10 : 0x00);
InCounter--;
do
{
uint32 tfr;
// Wait point (case 11): next input word.
{ { case 11: if(!(InFIFO.CanRead())) { MDRPhase = 12 - MDRPhaseBias - 1; return; } }; tfr = InFIFO.Read(); };
InCounter--;
//printf("KA: %04x %08x\n", InCounter, tfr);
// Four table bytes per word, lowest byte first.
for(int i = 0; i < 4; i++)
{
QMatrix[QMIndex >> 6][QMIndex & 0x3F] = (uint8)tfr;
QMIndex = (QMIndex + 1) & 0x7F;
tfr >>= 8;
}
} while(InCounter != 0xFFFF);
}
//
// Command 3: load the IDCT matrix (0x20 words, two 16-bit entries each).
//
else if(((Command >> 29) & 0x7) == 3)
{
IDCTMIndex = 0;
InCounter = 0x20;
InCounter--;
do
{
uint32 tfr;
// Wait point (case 13): next input word.
{ { case 13: if(!(InFIFO.CanRead())) { MDRPhase = 14 - MDRPhaseBias - 1; return; } }; tfr = InFIFO.Read(); };
InCounter--;
for(unsigned i = 0; i < 2; i++)
{
// Entries are stored transposed (row and column indices swapped) and arithmetically shifted right by 3.
IDCTMatrix[((IDCTMIndex & 0x7) << 3) | ((IDCTMIndex >> 3) & 0x7)] = (int16)(tfr & 0xFFFF) >> 3;
IDCTMIndex = (IDCTMIndex + 1) & 0x3F;
tfr >>= 16;
}
} while(InCounter != 0xFFFF);
}
else
{
// Any other command only latches its low 16 bits into InCounter.
InCounter = Command & 0xFFFF;
}
} // end for(;;)
}
}
// DMA input: queues one word into InFIFO and lets the state machine react.
// The word is silently dropped when the FIFO has no room.
void MDEC_DMAWrite(uint32 V)
{
 if(InFIFO.CanWrite())
 {
  InFIFO.Write(V);
  MDEC_Run(0);
 }
}
// DMA output: pops one word from OutFIFO.
//
// offs - receives the offset the caller should apply for this word, derived from
//        RAMOffsetY and RAMOffsetWWS; set to 0 when the FIFO is empty.
//
// Returns the word read, or 0 when OutFIFO is empty.
uint32 MDEC_DMARead(uint32* offs)
{
 *offs = 0;

 if(!MDFN_LIKELY(OutFIFO.CanRead()))
  return(0);

 const uint32 word = OutFIFO.Read();

 // Compute the offset for this word before the sequencing state advances.
 uint32 cur_offs = (RAMOffsetY & 0x7) * RAMOffsetWWS;

 if(RAMOffsetY & 0x08)
  cur_offs = (cur_offs - RAMOffsetWWS*7);

 *offs = cur_offs;

 // After RAMOffsetWWS words, move on to the next RAMOffsetY value.
 RAMOffsetCounter--;
 if(RAMOffsetCounter == 0)
 {
  RAMOffsetCounter = RAMOffsetWWS;
  RAMOffsetY++;
 }

 // Space was freed in OutFIFO; give the state machine a chance to continue.
 MDEC_Run(0);

 return(word);
}
// True when input DMA may proceed: InFIFO has room for at least 0x20 words,
// Control bit 30 is set, a command is in progress, and InCounter has not wrapped to 0xFFFF.
bool MDEC_DMACanWrite(void)
{
 if(InFIFO.CanWrite() < 0x20)
  return false;

 if(!(Control & (1U << 30)))
  return false;

 return(InCommand && (InCounter != 0xFFFF));
}
// True when output DMA may proceed: OutFIFO holds at least 0x20 words and
// Control bit 29 is set.
bool MDEC_DMACanRead(void)
{
 if(OutFIFO.CanRead() < 0x20)
  return false;

 return((Control & (1U << 29)) != 0);
}
// Handles a CPU write to one of the two MDEC registers.
//
// timestamp - unused here.
// A         - register address; bit 2 selects the control register, otherwise
//             the write goes to the command/data port.
// V         - value written.
void MDEC_Write(const int32_t timestamp, uint32 A, uint32 V)
{
 if(!(A & 4))
 {
  // Command/data port: queue the word if there is room, otherwise drop it.
  if(!InFIFO.CanWrite())
   return;

  InFIFO.Write(V);

  // When idle, make sure there is at least one cycle of budget so the new
  // command can start being processed.
  if(!InCommand && ClockCounter < 1)
   ClockCounter = 1;

  MDEC_Run(0);
  return;
 }

 // Control register: bit 31 requests a reset, the remaining bits are latched.
 if(V & 0x80000000)
 {
  // State machine and command tracking.
  MDRPhase = 0;
  ClockCounter = 0;
  Command = 0;
  InCommand = false;
  InCounter = 0;

  // Table load cursors and coefficient assembly state.
  QMIndex = 0;
  IDCTMIndex = 0;
  QScale = 0;
  memset(Coeff, 0, sizeof(Coeff));
  CoeffIndex = 0;
  DecodeWB = 0;

  // Pending output and both FIFOs.
  PixelBufferCount32 = 0;
  InFIFO.Flush();
  OutFIFO.Flush();
 }

 Control = V & 0x7FFFFFFF;
}
// Handles a CPU read from one of the two MDEC registers.
//
// timestamp - unused here.
// A         - register address; bit 2 selects the status register, otherwise
//             the read pops one word from OutFIFO.
//
// Returns the status word, or the popped data word (0 when OutFIFO is empty).
//
// Status layout as assembled below:
//  bit 31     OutFIFO empty
//  bit 30     InFIFO full
//  bit 29     command in progress
//  bit 28     MDEC_DMACanWrite()
//  bit 27     MDEC_DMACanRead()
//  bits 23-26 Command bits 25-28
//  bits 0-15  InCounter
uint32 MDEC_Read(const int32_t timestamp, uint32 A)
{
 if(!(A & 4))
 {
  uint32 data = 0;

  if(OutFIFO.CanRead())
   data = OutFIFO.Read();

  return(data);
 }

 uint32 ret = 0;

 // Flags are converted to uint32 before shifting: shifting a bool/int left by 31
 // moves a 1 into the sign bit of a signed int, which is not portable.
 ret |= (uint32)(OutFIFO.CanRead() == 0) << 31;
 ret |= (uint32)(InFIFO.CanWrite() == 0) << 30;
 ret |= (uint32)InCommand << 29;
 ret |= (uint32)MDEC_DMACanWrite() << 28;
 ret |= (uint32)MDEC_DMACanRead() << 27;
 ret |= ((Command >> 25) & 0xF) << 23;
 // Needs refactoring elsewhere to work right: ret |= ((DecodeWB + 4) % 6) << 16;
 ret |= InCounter & 0xFFFF;

 return(ret);
}