mirror of
https://github.com/hrydgard/ppsspp.git
synced 2024-11-26 23:10:38 +00:00
Pass uvScale in as an argument to the vertex decoder
Cleaner than overwriting/restoring gstate_c.uv in the decoder loop. A small cleanup I've been wanting to do for ages. Expecting a negligible perf boost, if any.
This commit is contained in:
parent
17a723e68c
commit
01cea7f088
@ -103,12 +103,9 @@ int DrawEngineCommon::ComputeNumVertsToDecode() const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void DrawEngineCommon::DecodeVerts(u8 *dest) {
|
void DrawEngineCommon::DecodeVerts(u8 *dest) {
|
||||||
const UVScale origUV = gstate_c.uv;
|
|
||||||
for (; decodeCounter_ < numDrawCalls_; decodeCounter_++) {
|
for (; decodeCounter_ < numDrawCalls_; decodeCounter_++) {
|
||||||
gstate_c.uv = drawCalls_[decodeCounter_].uvScale;
|
DecodeVertsStep(dest, decodeCounter_, decodedVerts_, &drawCalls_[decodeCounter_].uvScale); // NOTE! DecodeVertsStep can modify decodeCounter_!
|
||||||
DecodeVertsStep(dest, decodeCounter_, decodedVerts_); // NOTE! DecodeVertsStep can modify decodeCounter_!
|
|
||||||
}
|
}
|
||||||
gstate_c.uv = origUV;
|
|
||||||
|
|
||||||
// Sanity check
|
// Sanity check
|
||||||
if (indexGen.Prim() < 0) {
|
if (indexGen.Prim() < 0) {
|
||||||
@ -505,7 +502,7 @@ bool DrawEngineCommon::GetCurrentSimpleVertices(int count, std::vector<GPUDebugV
|
|||||||
u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr, VertexDecoder *dec, int lowerBound, int upperBound, u32 vertType) {
|
u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr, VertexDecoder *dec, int lowerBound, int upperBound, u32 vertType) {
|
||||||
// First, decode the vertices into a GPU compatible format. This step can be eliminated but will need a separate
|
// First, decode the vertices into a GPU compatible format. This step can be eliminated but will need a separate
|
||||||
// implementation of the vertex decoder.
|
// implementation of the vertex decoder.
|
||||||
dec->DecodeVerts(bufPtr, inPtr, lowerBound, upperBound);
|
dec->DecodeVerts(bufPtr, inPtr, &gstate_c.uv, lowerBound, upperBound);
|
||||||
|
|
||||||
// OK, morphing eliminated but bones still remain to be taken care of.
|
// OK, morphing eliminated but bones still remain to be taken care of.
|
||||||
// Let's do a partial software transform where we only do skinning.
|
// Let's do a partial software transform where we only do skinning.
|
||||||
@ -612,7 +609,7 @@ void DrawEngineCommon::ApplyFramebufferRead(FBOTexState *fboTexState) {
|
|||||||
gstate_c.Dirty(DIRTY_SHADERBLEND);
|
gstate_c.Dirty(DIRTY_SHADERBLEND);
|
||||||
}
|
}
|
||||||
|
|
||||||
void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
|
void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts, const UVScale *uvScale) {
|
||||||
PROFILE_THIS_SCOPE("vertdec");
|
PROFILE_THIS_SCOPE("vertdec");
|
||||||
|
|
||||||
const DeferredDrawCall &dc = drawCalls_[i];
|
const DeferredDrawCall &dc = drawCalls_[i];
|
||||||
@ -624,7 +621,7 @@ void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
|
|||||||
if (dc.indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) {
|
if (dc.indexType == GE_VTYPE_IDX_NONE >> GE_VTYPE_IDX_SHIFT) {
|
||||||
// Decode the verts (and at the same time apply morphing/skinning). Simple.
|
// Decode the verts (and at the same time apply morphing/skinning). Simple.
|
||||||
dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride,
|
dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride,
|
||||||
dc.verts, indexLowerBound, indexUpperBound);
|
dc.verts, uvScale, indexLowerBound, indexUpperBound);
|
||||||
decodedVerts += indexUpperBound - indexLowerBound + 1;
|
decodedVerts += indexUpperBound - indexLowerBound + 1;
|
||||||
|
|
||||||
bool clockwise = true;
|
bool clockwise = true;
|
||||||
@ -691,7 +688,7 @@ void DrawEngineCommon::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
|
|||||||
|
|
||||||
// 3. Decode that range of vertex data.
|
// 3. Decode that range of vertex data.
|
||||||
dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride,
|
dec_->DecodeVerts(dest + decodedVerts * (int)dec_->GetDecVtxFmt().stride,
|
||||||
dc.verts, indexLowerBound, indexUpperBound);
|
dc.verts, uvScale, indexLowerBound, indexUpperBound);
|
||||||
decodedVerts += vertexCount;
|
decodedVerts += vertexCount;
|
||||||
|
|
||||||
// 4. Advance indexgen vertex counter.
|
// 4. Advance indexgen vertex counter.
|
||||||
@ -849,7 +846,7 @@ void DrawEngineCommon::SubmitPrim(const void *verts, const void *inds, GEPrimiti
|
|||||||
vertexCountInDrawCalls_ += vertexCount;
|
vertexCountInDrawCalls_ += vertexCount;
|
||||||
|
|
||||||
if (decOptions_.applySkinInDecode && (vertTypeID & GE_VTYPE_WEIGHT_MASK)) {
|
if (decOptions_.applySkinInDecode && (vertTypeID & GE_VTYPE_WEIGHT_MASK)) {
|
||||||
DecodeVertsStep(decoded_, decodeCounter_, decodedVerts_);
|
DecodeVertsStep(decoded_, decodeCounter_, decodedVerts_, &dc.uvScale);
|
||||||
decodeCounter_++;
|
decodeCounter_++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -143,7 +143,7 @@ protected:
|
|||||||
uint64_t ComputeHash();
|
uint64_t ComputeHash();
|
||||||
|
|
||||||
// Vertex decoding
|
// Vertex decoding
|
||||||
void DecodeVertsStep(u8 *dest, int &i, int &decodedVerts);
|
void DecodeVertsStep(u8 *dest, int &i, int &decodedVerts, const UVScale *uvScale);
|
||||||
|
|
||||||
void ApplyFramebufferRead(FBOTexState *fboTexState);
|
void ApplyFramebufferRead(FBOTexState *fboTexState);
|
||||||
|
|
||||||
|
@ -190,7 +190,6 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
|
|||||||
|
|
||||||
// Keep the scale/offset in a few fp registers if we need it.
|
// Keep the scale/offset in a few fp registers if we need it.
|
||||||
if (prescaleStep) {
|
if (prescaleStep) {
|
||||||
MOVP2R(R3, &gstate_c.uv);
|
|
||||||
VLD1(F_32, neonUVScaleReg, R3, 2, ALIGN_NONE);
|
VLD1(F_32, neonUVScaleReg, R3, 2, ALIGN_NONE);
|
||||||
if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
|
if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
|
||||||
VMOV_neon(F_32, neonScratchReg, by128);
|
VMOV_neon(F_32, neonScratchReg, by128);
|
||||||
|
@ -39,6 +39,9 @@ static const ARM64Reg srcReg = X0;
|
|||||||
static const ARM64Reg dstReg = X1;
|
static const ARM64Reg dstReg = X1;
|
||||||
|
|
||||||
static const ARM64Reg counterReg = W2;
|
static const ARM64Reg counterReg = W2;
|
||||||
|
|
||||||
|
static const ARM64Reg uvScaleReg = X3;
|
||||||
|
|
||||||
static const ARM64Reg tempReg1 = W3;
|
static const ARM64Reg tempReg1 = W3;
|
||||||
static const ARM64Reg tempRegPtr = X3;
|
static const ARM64Reg tempRegPtr = X3;
|
||||||
static const ARM64Reg tempReg2 = W4;
|
static const ARM64Reg tempReg2 = W4;
|
||||||
@ -175,7 +178,6 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
|
|||||||
|
|
||||||
// Keep the scale/offset in a few fp registers if we need it.
|
// Keep the scale/offset in a few fp registers if we need it.
|
||||||
if (prescaleStep) {
|
if (prescaleStep) {
|
||||||
MOVP2R(X3, &gstate_c.uv);
|
|
||||||
fp.LDR(64, INDEX_UNSIGNED, neonUVScaleReg, X3, 0);
|
fp.LDR(64, INDEX_UNSIGNED, neonUVScaleReg, X3, 0);
|
||||||
fp.LDR(64, INDEX_UNSIGNED, neonUVOffsetReg, X3, 8);
|
fp.LDR(64, INDEX_UNSIGNED, neonUVOffsetReg, X3, 8);
|
||||||
if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
|
if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
|
||||||
|
@ -1282,11 +1282,10 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowerBound, int indexUpperBound) const {
|
void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, const UVScale *uvScaleOffset, int indexLowerBound, int indexUpperBound) const {
|
||||||
// Decode the vertices within the found bounds, once each
|
// Decode the vertices within the found bounds, once each
|
||||||
// decoded_ and ptr_ are used in the steps, so can't be turned into locals for speed.
|
// decoded_ and ptr_ are used in the steps, so can't be turned into locals for speed.
|
||||||
decoded_ = decodedptr;
|
const u8 *startPtr = (const u8*)verts + indexLowerBound * size;
|
||||||
ptr_ = (const u8*)verts + indexLowerBound * size;
|
|
||||||
|
|
||||||
int count = indexUpperBound - indexLowerBound + 1;
|
int count = indexUpperBound - indexLowerBound + 1;
|
||||||
int stride = decFmt.stride;
|
int stride = decFmt.stride;
|
||||||
@ -1300,8 +1299,10 @@ void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowe
|
|||||||
|
|
||||||
if (jitted_) {
|
if (jitted_) {
|
||||||
// We've compiled the steps into optimized machine code, so just jump!
|
// We've compiled the steps into optimized machine code, so just jump!
|
||||||
jitted_(ptr_, decoded_, count);
|
jitted_(startPtr, decodedptr, count, uvScaleOffset);
|
||||||
} else {
|
} else {
|
||||||
|
ptr_ = startPtr;
|
||||||
|
decoded_ = decodedptr;
|
||||||
// Interpret the decode steps
|
// Interpret the decode steps
|
||||||
for (; count; count--) {
|
for (; count; count--) {
|
||||||
for (int i = 0; i < numSteps_; i++) {
|
for (int i = 0; i < numSteps_; i++) {
|
||||||
|
@ -320,7 +320,7 @@ struct JitLookup {
|
|||||||
// Collapse to less skinning shaders to reduce shader switching, which is expensive.
|
// Collapse to less skinning shaders to reduce shader switching, which is expensive.
|
||||||
int TranslateNumBones(int bones);
|
int TranslateNumBones(int bones);
|
||||||
|
|
||||||
typedef void(*JittedVertexDecoder)(const u8 *src, u8 *dst, int count);
|
typedef void (*JittedVertexDecoder)(const u8 *src, u8 *dst, int count, const UVScale *uvScaleOffset);
|
||||||
|
|
||||||
struct VertexDecoderOptions {
|
struct VertexDecoderOptions {
|
||||||
bool expandAllWeightsToFloat;
|
bool expandAllWeightsToFloat;
|
||||||
@ -338,7 +338,7 @@ public:
|
|||||||
|
|
||||||
const DecVtxFormat &GetDecVtxFmt() const { return decFmt; }
|
const DecVtxFormat &GetDecVtxFmt() const { return decFmt; }
|
||||||
|
|
||||||
void DecodeVerts(u8 *decoded, const void *verts, int indexLowerBound, int indexUpperBound) const;
|
void DecodeVerts(u8 *decoded, const void *verts, const UVScale *uvScaleOffset, int indexLowerBound, int indexUpperBound) const;
|
||||||
|
|
||||||
int VertexSize() const { return size; } // PSP format size
|
int VertexSize() const { return size; } // PSP format size
|
||||||
|
|
||||||
|
@ -33,11 +33,11 @@ static const float const65535 = 65535.0f;
|
|||||||
|
|
||||||
using namespace RiscVGen;
|
using namespace RiscVGen;
|
||||||
|
|
||||||
static const RiscVReg srcReg = X10;
|
static const RiscVReg srcReg = X10; // a0
|
||||||
static const RiscVReg dstReg = X11;
|
static const RiscVReg dstReg = X11; // a1
|
||||||
static const RiscVReg counterReg = X12;
|
static const RiscVReg counterReg = X12; // a2
|
||||||
|
|
||||||
static const RiscVReg tempReg1 = X13;
|
static const RiscVReg tempReg1 = X13; // a3
|
||||||
static const RiscVReg tempReg2 = X14;
|
static const RiscVReg tempReg2 = X14;
|
||||||
static const RiscVReg tempReg3 = X15;
|
static const RiscVReg tempReg3 = X15;
|
||||||
static const RiscVReg scratchReg = X16;
|
static const RiscVReg scratchReg = X16;
|
||||||
@ -234,7 +234,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
|
|||||||
|
|
||||||
// Keep the scale/offset in a few fp registers if we need it.
|
// Keep the scale/offset in a few fp registers if we need it.
|
||||||
if (prescaleStep) {
|
if (prescaleStep) {
|
||||||
LI(tempReg1, &gstate_c.uv);
|
// tempReg1 happens to be the fourth argument register.
|
||||||
FL(32, prescaleRegs.scale.u, tempReg1, 0);
|
FL(32, prescaleRegs.scale.u, tempReg1, 0);
|
||||||
FL(32, prescaleRegs.scale.v, tempReg1, 4);
|
FL(32, prescaleRegs.scale.v, tempReg1, 4);
|
||||||
FL(32, prescaleRegs.offset.u, tempReg1, 8);
|
FL(32, prescaleRegs.offset.u, tempReg1, 8);
|
||||||
|
@ -60,6 +60,7 @@ static const X64Reg tempReg3 = R10;
|
|||||||
static const X64Reg srcReg = RCX;
|
static const X64Reg srcReg = RCX;
|
||||||
static const X64Reg dstReg = RDX;
|
static const X64Reg dstReg = RDX;
|
||||||
static const X64Reg counterReg = R8;
|
static const X64Reg counterReg = R8;
|
||||||
|
static const X64Reg uvScalePtrReg = R9; // only used during init
|
||||||
static const X64Reg alphaReg = R11;
|
static const X64Reg alphaReg = R11;
|
||||||
#else
|
#else
|
||||||
static const X64Reg tempReg1 = RAX;
|
static const X64Reg tempReg1 = RAX;
|
||||||
@ -68,6 +69,7 @@ static const X64Reg tempReg3 = R10;
|
|||||||
static const X64Reg srcReg = RDI;
|
static const X64Reg srcReg = RDI;
|
||||||
static const X64Reg dstReg = RSI;
|
static const X64Reg dstReg = RSI;
|
||||||
static const X64Reg counterReg = RDX;
|
static const X64Reg counterReg = RDX;
|
||||||
|
static const X64Reg uvScalePtrReg = RCX; // only used during init
|
||||||
static const X64Reg alphaReg = R11;
|
static const X64Reg alphaReg = R11;
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
@ -77,6 +79,7 @@ static const X64Reg tempReg3 = EDX;
|
|||||||
static const X64Reg srcReg = ESI;
|
static const X64Reg srcReg = ESI;
|
||||||
static const X64Reg dstReg = EDI;
|
static const X64Reg dstReg = EDI;
|
||||||
static const X64Reg counterReg = ECX;
|
static const X64Reg counterReg = ECX;
|
||||||
|
static const X64Reg uvScalePtrReg = EDX; // only used during init
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// XMM0-XMM5 are volatile on Windows X64
|
// XMM0-XMM5 are volatile on Windows X64
|
||||||
@ -168,6 +171,22 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
|
|||||||
BeginWrite(4096);
|
BeginWrite(4096);
|
||||||
const u8 *start = this->AlignCode16();
|
const u8 *start = this->AlignCode16();
|
||||||
|
|
||||||
|
bool prescaleStep = false;
|
||||||
|
// Look for prescaled texcoord steps
|
||||||
|
for (int i = 0; i < dec.numSteps_; i++) {
|
||||||
|
if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
|
||||||
|
dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
|
||||||
|
dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
|
||||||
|
prescaleStep = true;
|
||||||
|
}
|
||||||
|
if (dec.steps_[i] == &VertexDecoder::Step_TcU8PrescaleMorph ||
|
||||||
|
dec.steps_[i] == &VertexDecoder::Step_TcU16PrescaleMorph ||
|
||||||
|
dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescaleMorph) {
|
||||||
|
prescaleStep = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#if PPSSPP_ARCH(X86)
|
#if PPSSPP_ARCH(X86)
|
||||||
// Store register values
|
// Store register values
|
||||||
PUSH(ESI);
|
PUSH(ESI);
|
||||||
@ -180,6 +199,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
|
|||||||
MOV(32, R(srcReg), MDisp(ESP, 16 + offset + 0));
|
MOV(32, R(srcReg), MDisp(ESP, 16 + offset + 0));
|
||||||
MOV(32, R(dstReg), MDisp(ESP, 16 + offset + 4));
|
MOV(32, R(dstReg), MDisp(ESP, 16 + offset + 4));
|
||||||
MOV(32, R(counterReg), MDisp(ESP, 16 + offset + 8));
|
MOV(32, R(counterReg), MDisp(ESP, 16 + offset + 8));
|
||||||
|
MOV(32, R(uvScalePtrReg), MDisp(ESP, 16 + offset + 12));
|
||||||
|
|
||||||
const uint8_t STACK_FIXED_ALLOC = 64;
|
const uint8_t STACK_FIXED_ALLOC = 64;
|
||||||
#else
|
#else
|
||||||
@ -210,52 +230,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool prescaleStep = false;
|
|
||||||
// Look for prescaled texcoord steps
|
|
||||||
for (int i = 0; i < dec.numSteps_; i++) {
|
|
||||||
if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
|
|
||||||
dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
|
|
||||||
dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
|
|
||||||
prescaleStep = true;
|
|
||||||
}
|
|
||||||
if (dec.steps_[i] == &VertexDecoder::Step_TcU8PrescaleMorph ||
|
|
||||||
dec.steps_[i] == &VertexDecoder::Step_TcU16PrescaleMorph ||
|
|
||||||
dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescaleMorph) {
|
|
||||||
prescaleStep = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add code to convert matrices to 4x4.
|
|
||||||
// Later we might want to do this when the matrices are loaded instead.
|
|
||||||
if (dec.skinInDecode) {
|
|
||||||
MOV(PTRBITS, R(tempReg1), ImmPtr(&threeMasks));
|
|
||||||
MOVAPS(XMM4, MatR(tempReg1));
|
|
||||||
MOV(PTRBITS, R(tempReg1), ImmPtr(&aOne));
|
|
||||||
MOVUPS(XMM5, MatR(tempReg1));
|
|
||||||
MOV(PTRBITS, R(tempReg1), ImmPtr(gstate.boneMatrix));
|
|
||||||
MOV(PTRBITS, R(tempReg2), ImmPtr(bones));
|
|
||||||
for (int i = 0; i < dec.nweights; i++) {
|
|
||||||
MOVUPS(XMM0, MDisp(tempReg1, (12 * i) * 4));
|
|
||||||
MOVUPS(XMM1, MDisp(tempReg1, (12 * i + 3) * 4));
|
|
||||||
MOVUPS(XMM2, MDisp(tempReg1, (12 * i + 3 * 2) * 4));
|
|
||||||
MOVUPS(XMM3, MDisp(tempReg1, (12 * i + 3 * 3) * 4));
|
|
||||||
ANDPS(XMM0, R(XMM4));
|
|
||||||
ANDPS(XMM1, R(XMM4));
|
|
||||||
ANDPS(XMM2, R(XMM4));
|
|
||||||
ANDPS(XMM3, R(XMM4));
|
|
||||||
ORPS(XMM3, R(XMM5));
|
|
||||||
MOVAPS(MDisp(tempReg2, (16 * i) * 4), XMM0);
|
|
||||||
MOVAPS(MDisp(tempReg2, (16 * i + 4) * 4), XMM1);
|
|
||||||
MOVAPS(MDisp(tempReg2, (16 * i + 8) * 4), XMM2);
|
|
||||||
MOVAPS(MDisp(tempReg2, (16 * i + 12) * 4), XMM3);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Keep the scale/offset in a few fp registers if we need it.
|
// Keep the scale/offset in a few fp registers if we need it.
|
||||||
// TODO: Read it from an argument pointer instead of gstate_c.uv.
|
// TODO: Read it from an argument pointer instead of gstate_c.uv.
|
||||||
if (prescaleStep) {
|
if (prescaleStep) {
|
||||||
MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.uv));
|
// uvScalePtrReg should point to gstate_c.uv, or wherever the UV scale we want to use is located.
|
||||||
MOVUPS(fpScaleOffsetReg, MatR(tempReg1));
|
MOVUPS(fpScaleOffsetReg, MatR(uvScalePtrReg));
|
||||||
if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
|
if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
|
||||||
MOV(PTRBITS, R(tempReg2), ImmPtr(&by128_11));
|
MOV(PTRBITS, R(tempReg2), ImmPtr(&by128_11));
|
||||||
MULPS(fpScaleOffsetReg, MatR(tempReg2));
|
MULPS(fpScaleOffsetReg, MatR(tempReg2));
|
||||||
@ -265,6 +244,33 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add code to convert matrices to 4x4.
|
||||||
|
// Later we might want to do this when the matrices are loaded instead.
|
||||||
|
// Can't touch fpScaleOffsetReg (XMM0) in here!
|
||||||
|
if (dec.skinInDecode) {
|
||||||
|
MOV(PTRBITS, R(tempReg1), ImmPtr(&threeMasks));
|
||||||
|
MOVAPS(XMM5, MatR(tempReg1));
|
||||||
|
MOV(PTRBITS, R(tempReg1), ImmPtr(&aOne));
|
||||||
|
MOVUPS(XMM6, MatR(tempReg1));
|
||||||
|
MOV(PTRBITS, R(tempReg1), ImmPtr(gstate.boneMatrix));
|
||||||
|
MOV(PTRBITS, R(tempReg2), ImmPtr(bones));
|
||||||
|
for (int i = 0; i < dec.nweights; i++) {
|
||||||
|
MOVUPS(XMM1, MDisp(tempReg1, (12 * i) * 4));
|
||||||
|
MOVUPS(XMM2, MDisp(tempReg1, (12 * i + 3) * 4));
|
||||||
|
MOVUPS(XMM3, MDisp(tempReg1, (12 * i + 3 * 2) * 4));
|
||||||
|
MOVUPS(XMM4, MDisp(tempReg1, (12 * i + 3 * 3) * 4));
|
||||||
|
ANDPS(XMM1, R(XMM5));
|
||||||
|
ANDPS(XMM2, R(XMM5));
|
||||||
|
ANDPS(XMM3, R(XMM5));
|
||||||
|
ANDPS(XMM4, R(XMM5));
|
||||||
|
ORPS(XMM4, R(XMM6));
|
||||||
|
MOVAPS(MDisp(tempReg2, (16 * i) * 4), XMM1);
|
||||||
|
MOVAPS(MDisp(tempReg2, (16 * i + 4) * 4), XMM2);
|
||||||
|
MOVAPS(MDisp(tempReg2, (16 * i + 8) * 4), XMM3);
|
||||||
|
MOVAPS(MDisp(tempReg2, (16 * i + 12) * 4), XMM4);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Let's not bother with a proper stack frame. We just grab the arguments and go.
|
// Let's not bother with a proper stack frame. We just grab the arguments and go.
|
||||||
JumpTarget loopStart = GetCodePtr();
|
JumpTarget loopStart = GetCodePtr();
|
||||||
for (int i = 0; i < dec.numSteps_; i++) {
|
for (int i = 0; i < dec.numSteps_; i++) {
|
||||||
@ -775,6 +781,8 @@ void VertexDecoderJitCache::Jit_TcU8Prescale() {
|
|||||||
CVTSI2SS(fpScratchReg, R(tempReg1));
|
CVTSI2SS(fpScratchReg, R(tempReg1));
|
||||||
CVTSI2SS(fpScratchReg2, R(tempReg2));
|
CVTSI2SS(fpScratchReg2, R(tempReg2));
|
||||||
UNPCKLPS(fpScratchReg, R(fpScratchReg2));
|
UNPCKLPS(fpScratchReg, R(fpScratchReg2));
|
||||||
|
// TODO: These are a lot of nasty consecutive dependencies. Can probably be made faster
|
||||||
|
// if we can spare another register to avoid the shuffle, like on ARM.
|
||||||
MULPS(fpScratchReg, R(fpScaleOffsetReg));
|
MULPS(fpScratchReg, R(fpScaleOffsetReg));
|
||||||
SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
|
SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2));
|
||||||
ADDPS(fpScratchReg, R(fpScaleOffsetReg));
|
ADDPS(fpScratchReg, R(fpScaleOffsetReg));
|
||||||
|
@ -469,7 +469,7 @@ public:
|
|||||||
if (useIndices_)
|
if (useIndices_)
|
||||||
GetIndexBounds(indices, vertex_count, vertex_type, &lowerBound_, &upperBound_);
|
GetIndexBounds(indices, vertex_count, vertex_type, &lowerBound_, &upperBound_);
|
||||||
if (vertex_count != 0)
|
if (vertex_count != 0)
|
||||||
vdecoder.DecodeVerts(base, vertices, lowerBound_, upperBound_);
|
vdecoder.DecodeVerts(base, vertices, &gstate_c.uv, lowerBound_, upperBound_);
|
||||||
|
|
||||||
// If we're only using a subset of verts, it's better to decode with random access (usually.)
|
// If we're only using a subset of verts, it's better to decode with random access (usually.)
|
||||||
// However, if we're reusing a lot of verts, we should read and cache them.
|
// However, if we're reusing a lot of verts, we should read and cache them.
|
||||||
|
1
Windows/.gitignore
vendored
1
Windows/.gitignore
vendored
@ -2,3 +2,4 @@
|
|||||||
*.VC.db
|
*.VC.db
|
||||||
*.txt
|
*.txt
|
||||||
enc_temp_folder
|
enc_temp_folder
|
||||||
|
Win32
|
||||||
|
@ -78,7 +78,7 @@ public:
|
|||||||
void Execute(int vtype, int indexUpperBound, bool useJit) {
|
void Execute(int vtype, int indexUpperBound, bool useJit) {
|
||||||
SetupExecute(vtype, useJit);
|
SetupExecute(vtype, useJit);
|
||||||
|
|
||||||
dec_->DecodeVerts(dst_, src_, indexLowerBound_, indexUpperBound);
|
dec_->DecodeVerts(dst_, src_, &gstate_c.uv, indexLowerBound_, indexUpperBound);
|
||||||
}
|
}
|
||||||
|
|
||||||
double ExecuteTimed(int vtype, int indexUpperBound, bool useJit) {
|
double ExecuteTimed(int vtype, int indexUpperBound, bool useJit) {
|
||||||
@ -88,7 +88,7 @@ public:
|
|||||||
double st = time_now_d();
|
double st = time_now_d();
|
||||||
do {
|
do {
|
||||||
for (int j = 0; j < ROUNDS; ++j) {
|
for (int j = 0; j < ROUNDS; ++j) {
|
||||||
dec_->DecodeVerts(dst_, src_, indexLowerBound_, indexUpperBound);
|
dec_->DecodeVerts(dst_, src_, &gstate_c.uv, indexLowerBound_, indexUpperBound);
|
||||||
++total;
|
++total;
|
||||||
}
|
}
|
||||||
} while (time_now_d() - st < 0.5);
|
} while (time_now_d() - st < 0.5);
|
||||||
|
Loading…
Reference in New Issue
Block a user