ppsspp/GPU/Common/VertexDecoderCommon.h

// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#pragma once

#include <cstring>

#include "ppsspp_config.h"

#include "Common/CommonTypes.h"
#include "Common/Data/Collections/Hashmaps.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Log.h"
#include "Common/LogReporting.h"
#include "GPU/ge_constants.h"
#include "GPU/Common/ShaderCommon.h"
#include "GPU/GPUCommon.h"
#include "GPU/GPUState.h"

#if PPSSPP_ARCH(ARM)
#include "Common/ArmEmitter.h"
#elif PPSSPP_ARCH(ARM64)
#include "Common/Arm64Emitter.h"
#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#include "Common/x64Emitter.h"
#elif PPSSPP_ARCH(RISCV64)
#include "Common/RiscVEmitter.h"
#else
#include "Common/FakeEmitter.h"
#endif

// DecVtxFormat - vertex formats for PC
// Kind of like a D3D VertexDeclaration.
// Can write code to easily bind these using OpenGL, or read these manually.
// No morph support, that is taken care of by the VertexDecoder.

// Component formats used by the fields of DecVtxFormat.
// Keep this in 4 bits: every enumerator must stay below 16 (checked by the static_assert below).
// The FLOAT_n, U8_n and U16_n groups are each contiguous, so "fmt - DEC_xxx_1 + 1" is the
// component count; readers such as VertexReader::ReadWeights depend on that ordering.
enum {
	DEC_NONE,
	DEC_FLOAT_1,
	DEC_FLOAT_2,
	DEC_FLOAT_3,
	DEC_FLOAT_4,
	DEC_S8_3,
	DEC_S16_3,
	DEC_U8_1,
	DEC_U8_2,
	DEC_U8_3,
	DEC_U8_4,
	DEC_U16_1,
	DEC_U16_2,
	DEC_U16_3,
	DEC_U16_4,
};

// DEC_U16_4 is the last enumerator; move this check if new formats are appended.
static_assert(DEC_U16_4 < 16, "DEC_* vertex formats must fit in 4 bits");

// Layout of one decoded vertex: for each attribute, a DEC_* format code and the
// byte offset of that attribute from the start of the vertex.
struct DecVtxFormat {
	// First 4 weights.
	u8 w0fmt;
	u8 w0off;
	// Second 4 weights.
	u8 w1fmt;
	u8 w1off;
	// Texture coordinates.
	u8 uvfmt;
	u8 uvoff;
	// First color.
	u8 c0fmt;
	u8 c0off;
	// Second color.
	u8 c1fmt;
	u8 c1off;
	// Normal.
	u8 nrmfmt;
	u8 nrmoff;
	// Position. There is no format field: the output position format is always
	// DEC_FLOAT_3 (see PosFmt()).
	u8 posoff;
	// Distance in bytes from one decoded vertex to the next.
	u8 stride;

	// NOTE(review): presumably a compact encoding of the fields above, produced by
	// ComputeID() and consumed by InitializeFromID() - confirm in the implementation.
	uint32_t id;
	void ComputeID();
	void InitializeFromID(uint32_t id);

	// The fixed format of the position attribute.
	static u8 PosFmt() {
		return DEC_FLOAT_3;
	}
};

// Defined elsewhere. NOTE(review): judging by the signature, this scans `count` indices
// starting at `inds` (with the index width presumably selected by the index bits of
// vertType) and writes the smallest and largest index found to *indexLowerBound and
// *indexUpperBound - confirm against the implementation.
void GetIndexBounds(const void *inds, int count, u32 vertType, u16 *indexLowerBound, u16 *indexUpperBound);

// Rounds x up to the next multiple of 4; values that are already a multiple of 4
// are returned unchanged.
inline int RoundUp4(int x) {
	constexpr int kAlignMask = 4 - 1;
	const int padded = x + kAlignMask;
	return padded & ~kAlignMask;
}

// Translates a sequential element number into a vertex index by looking it up in an
// index buffer whose element width is selected by the index bits of the vertex type.
// When the vertex type selects none of the 8/16/32-bit index formats, the element
// number is returned unchanged and the index buffer is never read.
class IndexConverter {
private:
	// Raw index buffer. It is cast to the element type on each lookup instead of being
	// read back through a differently-typed union member, which C++ does not guarantee.
	const void *indices_;
	// vertType masked with GE_VTYPE_IDX_MASK.
	u32 indexType_;

public:
	// vertType: vertex type word; only its GE_VTYPE_IDX_MASK bits are kept.
	// indices:  start of the index buffer; not dereferenced until operator() is called.
	IndexConverter(u32 vertType, const void *indices)
		: indices_(indices), indexType_(vertType & GE_VTYPE_IDX_MASK) {
	}

	// Returns the index stored at position `index` of the index buffer, widened to u32.
	// No bounds checking is performed.
	u32 operator() (u32 index) const {
		switch (indexType_) {
		case GE_VTYPE_IDX_8BIT:
			return static_cast<const u8 *>(indices_)[index];
		case GE_VTYPE_IDX_16BIT:
			return static_cast<const u16_le *>(indices_)[index];
		case GE_VTYPE_IDX_32BIT:
			return static_cast<const u32_le *>(indices_)[index];
		default:
			// Not an indexed format: positions map to themselves.
			return index;
		}
	}
};

// Reads decoded vertex formats in a convenient way. For software transform and debugging.
class VertexReader {
public:
	VertexReader(u8 *base, const DecVtxFormat &decFmt, int vtype) : base_(base), data_(base), decFmt_(decFmt), vtype_(vtype) {}

	void ReadPos(float pos[3]) const {
		// Only DEC_FLOAT_3 is supported.
		const float *f = (const float *)(data_ + decFmt_.posoff);
		pos[0] = f[0];
		pos[1] = f[1];
		if (!isThrough()) {
			pos[2] = f[2];
		} else {
			// Integer value passed in a float. Clamped to 0, 65535.
			pos[2] = (int)f[2] * (1.0f / 65535.0f);
		}
	}

	void ReadPosThroughZ16(float pos[3]) const {
		// Only DEC_FLOAT_3 is supported.
		const float *f = (const float *)(data_ + decFmt_.posoff);
		memcpy(pos, f, 12);
	}

	void ReadNrm(float nrm[3]) const {
		switch (decFmt_.nrmfmt) {
		case DEC_FLOAT_3:
			//memcpy(nrm, data_ + decFmt_.nrmoff, 12);
			{
				const float *f = (const float *)(data_ + decFmt_.nrmoff);
				for (int i = 0; i < 3; i++)
					nrm[i] = f[i];
			}
			break;
		case DEC_S16_3:
			{
				const s16 *s = (const s16 *)(data_ + decFmt_.nrmoff);
				for (int i = 0; i < 3; i++)
					nrm[i] = s[i] * (1.f / 32767.f);
			}
			break;
		case DEC_S8_3:
			{
				const s8 *b = (const s8 *)(data_ + decFmt_.nrmoff);
				for (int i = 0; i < 3; i++)
					nrm[i] = b[i] * (1.f / 127.f);
			}
			break;
		default:
			memset(nrm, 0, sizeof(float) * 3);
			break;
		}
	}

	void ReadUV(float uv[2]) const {
		// Only DEC_FLOAT_2 is supported.
		const float *f = (const float *)(data_ + decFmt_.uvoff);
		uv[0] = f[0];
		uv[1] = f[1];
	}

	void ReadColor0(float color[4]) const {
		switch (decFmt_.c0fmt) {
		case DEC_U8_4:
			Uint8x4ToFloat4(color, *(const u32 *)(data_ + decFmt_.c0off));
			break;
		case DEC_FLOAT_4:
			memcpy(color, data_ + decFmt_.c0off, 16);
			break;
		default:
			memset(color, 0, sizeof(float) * 4);
			break;
		}
	}

	u32 ReadColor0_8888() const {
		switch (decFmt_.c0fmt) {
		case DEC_U8_4:
			{
				const u8 *b = (const u8 *)(data_ + decFmt_.c0off);
				u32 value;
				memcpy(&value, b, 4);
				return value;
			}
			break;
		case DEC_FLOAT_4:
			{
				const float *f = (const float *)(data_ + decFmt_.c0off);
				return Float4ToUint8x4_NoClamp(f);
			}
			break;
		default:
			return 0;
		}
	}

	void ReadColor1(float color[3]) const {
		switch (decFmt_.c1fmt) {
		case DEC_U8_4:
			{
				const u8 *b = (const u8 *)(data_ + decFmt_.c1off);
				for (int i = 0; i < 3; i++)
					color[i] = b[i] * (1.f / 255.f);
			}
			break;
		case DEC_FLOAT_4:
			memcpy(color, data_ + decFmt_.c1off, 12);
			break;
		default:
			memset(color, 0, sizeof(float) * 3);
			break;
		}
	}

	void ReadWeights(float weights[8]) const {
		const float *f = (const float *)(data_ + decFmt_.w0off);
		const u8 *b = (const u8 *)(data_ + decFmt_.w0off);
		const u16 *s = (const u16 *)(data_ + decFmt_.w0off);
		switch (decFmt_.w0fmt) {
		case DEC_FLOAT_1:
		case DEC_FLOAT_2:
		case DEC_FLOAT_3:
		case DEC_FLOAT_4:
			for (int i = 0; i <= decFmt_.w0fmt - DEC_FLOAT_1; i++)
				weights[i] = f[i];
			break;
		case DEC_U8_1: weights[0] = b[0] * (1.f / 128.f); break;
		case DEC_U8_2: for (int i = 0; i < 2; i++) weights[i] = b[i] * (1.f / 128.f); break;
		case DEC_U8_3: for (int i = 0; i < 3; i++) weights[i] = b[i] * (1.f / 128.f); break;
		case DEC_U8_4: for (int i = 0; i < 4; i++) weights[i] = b[i] * (1.f / 128.f); break;
		case DEC_U16_1: weights[0] = s[0] * (1.f / 32768.f); break;
		case DEC_U16_2: for (int i = 0; i < 2; i++) weights[i] = s[i] * (1.f / 32768.f); break;
		case DEC_U16_3: for (int i = 0; i < 3; i++) weights[i] = s[i] * (1.f / 32768.f); break;
		case DEC_U16_4: for (int i = 0; i < 4; i++) weights[i] = s[i] * (1.f / 32768.f); break;
		default:
			ERROR_LOG_REPORT_ONCE(fmtw0, G3D, "Reader: Unsupported W0 Format %d", decFmt_.w0fmt);
			memset(weights, 0, sizeof(float) * 8);
			break;
		}

		f = (const float *)(data_ + decFmt_.w1off);
		b = (const u8 *)(data_ + decFmt_.w1off);
		s = (const u16 *)(data_ + decFmt_.w1off);
		switch (decFmt_.w1fmt) {
		case 0:
			// It's fine for there to be w0 weights but not w1.
			break;
		case DEC_FLOAT_1:
		case DEC_FLOAT_2:
		case DEC_FLOAT_3:
		case DEC_FLOAT_4:
			for (int i = 0; i <= decFmt_.w1fmt - DEC_FLOAT_1; i++)
				weights[i+4] = f[i];
			break;
		case DEC_U8_1: weights[4] = b[0] * (1.f / 128.f); break;
		case DEC_U8_2: for (int i = 0; i < 2; i++) weights[i+4] = b[i] * (1.f / 128.f); break;
		case DEC_U8_3: for (int i = 0; i < 3; i++) weights[i+4] = b[i] * (1.f / 128.f); break;
		case DEC_U8_4: for (int i = 0; i < 4; i++) weights[i+4] = b[i] * (1.f / 128.f); break;
		case DEC_U16_1: weights[4] = s[0] * (1.f / 32768.f); break;
		case DEC_U16_2: for (int i = 0; i < 2; i++) weights[i+4] = s[i] * (1.f / 32768.f); break;
		case DEC_U16_3: for (int i = 0; i < 3; i++) weights[i+4] = s[i] * (1.f / 32768.f); break;
		case DEC_U16_4: for (int i = 0; i < 4; i++) weights[i+4] = s[i]  * (1.f / 32768.f); break;
		default:
			memset(weights + 4, 0, sizeof(float) * 4);
			break;
		}
	}

	bool hasColor0() const { return decFmt_.c0fmt != 0; }
	bool hasColor1() const { return decFmt_.c1fmt != 0; }
	bool hasNormal() const { return decFmt_.nrmfmt != 0; }
	bool hasUV() const { return decFmt_.uvfmt != 0; }
	bool isThrough() const { return (vtype_ & GE_VTYPE_THROUGH) != 0; }
	void Goto(int index) {
		data_ = base_ + index * decFmt_.stride;
	}

private:
	u8 *base_;
	u8 *data_;
	DecVtxFormat decFmt_;
	int vtype_;
};
// Debugging utilities.
// Defined elsewhere. NOTE(review): presumably dumps the attributes of the vertex that
// vtx is currently positioned on (see VertexReader::Goto) - confirm in the implementation.
void PrintDecodedVertex(const VertexReader &vtx);


class VertexDecoder;
class VertexDecoderJitCache;

// Pointer to a const VertexDecoder member function taking no arguments; the
// VertexDecoder::Step_* methods have this shape.
using StepFunction = void (VertexDecoder::*)() const;
// Pointer to a VertexDecoderJitCache member function taking no arguments; the
// argument-less VertexDecoderJitCache::Jit_* methods have this shape.
using JitStepFunction = void (VertexDecoderJitCache::*)();

// Pairs a VertexDecoder step with a VertexDecoderJitCache step.
// NOTE(review): presumably used as a table entry mapping each interpreter step to the
// JIT routine that emits the equivalent code - confirm at the use site.
struct JitLookup {
	StepFunction func;        // Member of VertexDecoder.
	JitStepFunction jitFunc;  // Member of VertexDecoderJitCache.
};

// Collapses the bone count to fewer distinct values so that fewer skinning shaders are
// needed; this reduces shader switching, which is expensive.
int TranslateNumBones(int bones);

// Signature of a compiled vertex decoder, as returned by VertexDecoderJitCache::Compile.
using JittedVertexDecoder = void (*)(const u8 *src, u8 *dst, int count, const UVScale *uvScaleOffset);

// Flags passed to VertexDecoder::SetVertexType. Their effect is implemented elsewhere;
// the notes below are inferred from the names only.
struct VertexDecoderOptions {
	// NOTE(review): presumably makes weights decode to floats regardless of source format - confirm.
	bool expandAllWeightsToFloat;
	// NOTE(review): presumably makes 8-bit normals decode to floats - confirm.
	bool expand8BitNormalsToFloat;
	// NOTE(review): presumably selects the *Skin step variants so skinning is applied while decoding - confirm.
	bool applySkinInDecode;
	// NOTE(review): presumably pads decoded attributes to 4-byte boundaries - confirm.
	bool alignOutputToWord;
};

// Decodes vertices for one vertex type. SetVertexType() configures the decoder,
// GetDecVtxFmt() exposes the layout of the decoded output, and DecodeVerts() runs the
// decode.
//
// The Step_* methods all have the StepFunction shape (const, no arguments) so they can
// be stored in steps_; they exchange data through the mutable cursor members below.
// All data members are public and VertexDecoderJitCache is a friend, so the member
// order is left exactly as it was.
class VertexDecoder {
public:
	// A jit cache is not mandatory.
	void SetVertexType(u32 vtype, const VertexDecoderOptions &options, VertexDecoderJitCache *jitCache = nullptr);

	// The value stored in fmt_.
	u32 VertexType() const { return fmt_; }

	// Layout of the decoded (output) vertices.
	const DecVtxFormat &GetDecVtxFmt() const { return decFmt; }

	void DecodeVerts(u8 *decoded, const void *verts, const UVScale *uvScaleOffset, int indexLowerBound, int indexUpperBound) const;

	int VertexSize() const { return size; }  // PSP format size

	std::string GetString(DebugShaderStringType stringType);

	// Weight steps.
	void Step_WeightsU8() const;
	void Step_WeightsU16() const;
	void Step_WeightsU8ToFloat() const;
	void Step_WeightsU16ToFloat() const;
	void Step_WeightsFloat() const;

	void ComputeSkinMatrix(const float weights[8]) const;

	void Step_WeightsU8Skin() const;
	void Step_WeightsU16Skin() const;
	void Step_WeightsFloatSkin() const;

	// Texture coordinate steps.
	void Step_TcU8ToFloat() const;
	void Step_TcU16ToFloat() const;
	void Step_TcFloat() const;

	void Step_TcU8Prescale() const;
	void Step_TcU16Prescale() const;
	void Step_TcU16DoublePrescale() const;
	void Step_TcFloatPrescale() const;

	void Step_TcU16DoubleToFloat() const;
	void Step_TcU16ThroughToFloat() const;
	void Step_TcU16ThroughDoubleToFloat() const;
	void Step_TcFloatThrough() const;

	void Step_TcU8MorphToFloat() const;
	void Step_TcU16MorphToFloat() const;
	void Step_TcU16DoubleMorphToFloat() const;
	void Step_TcFloatMorph() const;
	void Step_TcU8PrescaleMorph() const;
	void Step_TcU16PrescaleMorph() const;
	void Step_TcU16DoublePrescaleMorph() const;
	void Step_TcFloatPrescaleMorph() const;

	// Color steps.
	void Step_ColorInvalid() const;
	void Step_Color4444() const;
	void Step_Color565() const;
	void Step_Color5551() const;
	void Step_Color8888() const;

	void Step_Color4444Morph() const;
	void Step_Color565Morph() const;
	void Step_Color5551Morph() const;
	void Step_Color8888Morph() const;

	// Normal steps.
	void Step_NormalS8() const;
	void Step_NormalS8ToFloat() const;
	void Step_NormalS16() const;
	void Step_NormalFloat() const;

	void Step_NormalS8Skin() const;
	void Step_NormalS16Skin() const;
	void Step_NormalFloatSkin() const;

	void Step_NormalS8Morph() const;
	void Step_NormalS16Morph() const;
	void Step_NormalFloatMorph() const;

	void Step_NormalS8MorphSkin() const;
	void Step_NormalS16MorphSkin() const;
	void Step_NormalFloatMorphSkin() const;

	// Position steps.
	void Step_PosS8() const;
	void Step_PosS16() const;
	void Step_PosFloat() const;

	void Step_PosS8Skin() const;
	void Step_PosS16Skin() const;
	void Step_PosFloatSkin() const;

	void Step_PosS8Morph() const;
	void Step_PosS16Morph() const;
	void Step_PosFloatMorph() const;

	void Step_PosS8MorphSkin() const;
	void Step_PosS16MorphSkin() const;
	void Step_PosFloatMorphSkin() const;

	void Step_PosInvalid() const;
	void Step_PosS8Through() const;
	void Step_PosS16Through() const;
	void Step_PosFloatThrough() const;

	// output must be big for safety.
	// Returns number of chars written.
	// Ugly for speed.
	int ToString(char *output) const;

	// Mutable decoder state
	mutable u8 *decoded_ = nullptr;
	mutable const u8 *ptr_ = nullptr;
	mutable const UVScale *prescaleUV_ = nullptr;
	// Compiled decoder, if any. Initialised with nullptr like the other pointers here.
	JittedVertexDecoder jitted_ = nullptr;
	int32_t jittedSize_ = 0;

	// "Immutable" state, set at startup

	// The decoding steps. Never more than 5.
	StepFunction steps_[5];
	int numSteps_;

	u32 fmt_;
	DecVtxFormat decFmt;

	bool throughmode;
	bool skinInDecode;
	// With morph and weights, this can be more than 256 bytes.
	u16 size;
	u8 onesize_;

	u8 weightoff;
	u8 tcoff;
	u8 coloff;
	u8 nrmoff;
	u8 posoff;

	u8 tc;
	u8 col;
	u8 nrm;
	u8 pos;
	u8 weighttype;
	u8 idx;
	u8 morphcount;
	u8 nweights;

	u8 biggest;  // in practice, alignment.

	friend class VertexDecoderJitCache;

private:
	void CompareToJit(const u8 *startPtr, u8 *decodedptr, int count, const UVScale *uvScaleOffset) const;
};


// A compiled vertex decoder takes the following arguments (C calling convention),
// matching the JittedVertexDecoder function pointer type:
// const u8 *src, u8 *dst, int count, const UVScale *uvScaleOffset
//
// x86:
//   src is placed in esi and dst in edi
//   for every vertex, we step esi and edi forwards by the two vertex sizes
//   all movs are done relative to esi and edi
//
// that's it!

// Selects the emitter code-block class that VertexDecoderJitCache derives from on this
// architecture. On any other architecture the macro stays undefined, and the
// FakeGen-based VertexDecoderJitCache below is used instead.
#if PPSSPP_ARCH(ARM)
#define VERTEXDECODER_JIT_BACKEND ArmGen::ARMXCodeBlock
#elif PPSSPP_ARCH(ARM64)
#define VERTEXDECODER_JIT_BACKEND Arm64Gen::ARM64CodeBlock
#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#define VERTEXDECODER_JIT_BACKEND Gen::XCodeBlock
#elif PPSSPP_ARCH(RISCV64)
#define VERTEXDECODER_JIT_BACKEND RiscVGen::RiscVCodeBlock
#endif


#ifdef VERTEXDECODER_JIT_BACKEND
// JIT compiler for vertex decoders, built on the architecture's emitter code block
// (VERTEXDECODER_JIT_BACKEND).
//
// The argument-less Jit_* methods have the JitStepFunction shape and are named after
// the VertexDecoder::Step_* methods. NOTE(review): presumably each one emits native code
// equivalent to the Step_* method of the same name - confirm in the per-architecture
// implementation files.
class VertexDecoderJitCache : public VERTEXDECODER_JIT_BACKEND {
public:
	VertexDecoderJitCache();

	// Returns a pointer to the code to run.
	// NOTE(review): jittedSize is presumably an out-parameter receiving the size of the
	// generated code - confirm in the implementation.
	JittedVertexDecoder Compile(const VertexDecoder &dec, int32_t *jittedSize);
	void Clear();

	// Weights.
	void Jit_WeightsU8();
	void Jit_WeightsU16();
	void Jit_WeightsU8ToFloat();
	void Jit_WeightsU16ToFloat();
	void Jit_WeightsFloat();

	void Jit_WeightsU8Skin();
	void Jit_WeightsU16Skin();
	void Jit_WeightsFloatSkin();

	// Texture coordinates.
	void Jit_TcU8ToFloat();
	void Jit_TcU16ToFloat();
	void Jit_TcFloat();

	void Jit_TcU8Prescale();
	void Jit_TcU16Prescale();
	void Jit_TcFloatPrescale();

	// Takes an argument, so unlike its neighbours it does not match JitStepFunction.
	void Jit_TcAnyMorph(int bits);
	void Jit_TcU8MorphToFloat();
	void Jit_TcU16MorphToFloat();
	void Jit_TcFloatMorph();
	void Jit_TcU8PrescaleMorph();
	void Jit_TcU16PrescaleMorph();
	void Jit_TcFloatPrescaleMorph();

	void Jit_TcU16ThroughToFloat();
	void Jit_TcFloatThrough();

	// Colors.
	void Jit_Color8888();
	void Jit_Color4444();
	void Jit_Color565();
	void Jit_Color5551();

	// Normals.
	void Jit_NormalS8();
	void Jit_NormalS8ToFloat();
	void Jit_NormalS16();
	void Jit_NormalFloat();

	void Jit_NormalS8Skin();
	void Jit_NormalS16Skin();
	void Jit_NormalFloatSkin();

	// Positions.
	void Jit_PosS8();
	void Jit_PosS16();
	void Jit_PosFloat();
	void Jit_PosS8Through();
	void Jit_PosS16Through();
	void Jit_PosFloatThrough();

	void Jit_PosS8Skin();
	void Jit_PosS16Skin();
	void Jit_PosFloatSkin();

	// Morphing variants.
	void Jit_NormalS8Morph();
	void Jit_NormalS16Morph();
	void Jit_NormalFloatMorph();

	void Jit_NormalS8MorphSkin();
	void Jit_NormalS16MorphSkin();
	void Jit_NormalFloatMorphSkin();

	void Jit_PosS8Morph();
	void Jit_PosS16Morph();
	void Jit_PosFloatMorph();

	void Jit_PosS8MorphSkin();
	void Jit_PosS16MorphSkin();
	void Jit_PosFloatMorphSkin();

	void Jit_Color8888Morph();
	void Jit_Color4444Morph();
	void Jit_Color565Morph();
	void Jit_Color5551Morph();

private:
	// Internal helpers shared by the Jit_* methods; defined in the per-architecture sources.
	bool CompileStep(const VertexDecoder &dec, int i);
	void Jit_ApplyWeights();
	void Jit_WriteMatrixMul(int outOff, bool pos);
	void Jit_WriteMorphColor(int outOff, bool checkAlpha = true);
	void Jit_AnyS8ToFloat(int srcoff);
	void Jit_AnyS16ToFloat(int srcoff);
	void Jit_AnyU8ToFloat(int srcoff, u32 bits = 32);
	void Jit_AnyU16ToFloat(int srcoff, u32 bits = 64);
	void Jit_AnyS8Morph(int srcoff, int dstoff);
	void Jit_AnyS16Morph(int srcoff, int dstoff);
	void Jit_AnyFloatMorph(int srcoff, int dstoff);

	// NOTE(review): presumably the decoder currently being compiled - confirm in Compile().
	const VertexDecoder *dec_ = nullptr;
#if PPSSPP_ARCH(ARM64)
	// Floating point emitter, only present on ARM64.
	Arm64Gen::ARM64FloatEmitter fp;
#endif
};
#else
// Stand-in used when no JIT backend exists for the target architecture
// (VERTEXDECODER_JIT_BACKEND is undefined). It never produces compiled code.
class VertexDecoderJitCache : public FakeGen::FakeXCodeBlock {
public:
	VertexDecoderJitCache();

	void Clear();

	// Always returns nullptr. Neither argument is used, and *jittedSize is not written.
	JittedVertexDecoder Compile(const VertexDecoder &dec, int32_t *jittedSize) {
		(void)dec;
		(void)jittedSize;
		return nullptr;
	}
};
#endif