mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-02-26 08:55:58 +00:00
Merge pull request #5710 from hrydgard/avoid-alpha-test
Avoid alpha test when vertexFullAlpha && textureFullAlpha
This commit is contained in:
commit
ff498ed63b
@ -64,11 +64,20 @@ static bool IsAlphaTestTriviallyTrue() {
|
||||
return true;
|
||||
|
||||
case GE_COMP_GEQUAL:
|
||||
if (gstate_c.vertexFullAlpha && (gstate_c.textureFullAlpha || !gstate.isTextureAlphaUsed()))
|
||||
return true; // If alpha is full, it doesn't matter what the ref value is.
|
||||
return gstate.getAlphaTestRef() == 0;
|
||||
|
||||
// Non-zero check. If we have no depth testing (and thus no depth writing), and an alpha func that will result in no change if zero alpha, get rid of the alpha test.
|
||||
// Speeds up Lumines by a LOT on PowerVR.
|
||||
case GE_COMP_NOTEQUAL:
|
||||
if ((gstate_c.vertexFullAlpha && (gstate_c.textureFullAlpha || !gstate.isTextureAlphaUsed())) && gstate.getAlphaTestRef() == 255) {
|
||||
// Likely to be rare. Let's just have the alpha test take care of this instead of adding
|
||||
// complicated code to discard the draw or whatnot.
|
||||
return false;
|
||||
}
|
||||
// Fallthrough on purpose
|
||||
|
||||
case GE_COMP_GREATER:
|
||||
{
|
||||
#if 0
|
||||
|
@ -265,11 +265,6 @@ void TransformDrawEngine::SetupVertexDecoder(u32 vertType) {
|
||||
if (vertTypeID != lastVType_) {
|
||||
dec_ = GetVertexDecoder(vertTypeID);
|
||||
lastVType_ = vertTypeID;
|
||||
|
||||
// TODO: Add functionality to VertexDecoder to scan for non-full alpha in the two other formats,
|
||||
// which are quite common.
|
||||
int colorType = vertTypeID & GE_VTYPE_COL_MASK;
|
||||
gstate_c.vertexFullAlpha = colorType == GE_VTYPE_COL_NONE || colorType == GE_VTYPE_COL_565;
|
||||
}
|
||||
}
|
||||
|
||||
@ -566,6 +561,8 @@ void TransformDrawEngine::DoFlush() {
|
||||
vai->numVerts = indexGen.VertexCount();
|
||||
vai->prim = indexGen.Prim();
|
||||
vai->maxIndex = indexGen.MaxIndex();
|
||||
vai->flags = gstate_c.vertexFullAlpha ? VAI_FLAG_VERTEXFULLALPHA : 0;
|
||||
|
||||
goto rotateVBO;
|
||||
}
|
||||
|
||||
@ -645,6 +642,8 @@ void TransformDrawEngine::DoFlush() {
|
||||
vertexCount = vai->numVerts;
|
||||
maxIndex = vai->maxIndex;
|
||||
prim = static_cast<GEPrimitiveType>(vai->prim);
|
||||
|
||||
gstate_c.vertexFullAlpha = vai->flags & VAI_FLAG_VERTEXFULLALPHA;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -665,6 +664,8 @@ void TransformDrawEngine::DoFlush() {
|
||||
vertexCount = vai->numVerts;
|
||||
maxIndex = vai->maxIndex;
|
||||
prim = static_cast<GEPrimitiveType>(vai->prim);
|
||||
|
||||
gstate_c.vertexFullAlpha = vai->flags & VAI_FLAG_VERTEXFULLALPHA;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -698,6 +699,12 @@ rotateVBO:
|
||||
}
|
||||
|
||||
VERBOSE_LOG(G3D, "Flush prim %i! %i verts in one go", prim, vertexCount);
|
||||
bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE;
|
||||
if (gstate.isModeThrough()) {
|
||||
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255);
|
||||
} else {
|
||||
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && ((hasColor && (gstate.materialupdate & 1)) || gstate.getMaterialAmbientA() == 255) && (!gstate.isLightingEnabled() || gstate.getAmbientA() == 255);
|
||||
}
|
||||
|
||||
LinkedShader *program = shaderManager_->ApplyFragmentShader(vshader, prim, lastVType_);
|
||||
SetupDecFmtForDraw(program, dec_->GetDecVtxFmt(), vbo ? 0 : decoded);
|
||||
@ -717,6 +724,13 @@ rotateVBO:
|
||||
glBindBuffer(GL_ARRAY_BUFFER, 0);
|
||||
} else {
|
||||
DecodeVerts();
|
||||
bool hasColor = (lastVType_ & GE_VTYPE_COL_MASK) != GE_VTYPE_COL_NONE;
|
||||
if (gstate.isModeThrough()) {
|
||||
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && (hasColor || gstate.getMaterialAmbientA() == 255);
|
||||
} else {
|
||||
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && ((hasColor && (gstate.materialupdate & 1)) || gstate.getMaterialAmbientA() == 255) && (!gstate.isLightingEnabled() || gstate.getAmbientA() == 255);
|
||||
}
|
||||
|
||||
LinkedShader *program = shaderManager_->ApplyFragmentShader(vshader, prim, lastVType_);
|
||||
gpuStats.numUncachedVertsDrawn += indexGen.VertexCount();
|
||||
prim = indexGen.Prim();
|
||||
@ -737,6 +751,7 @@ rotateVBO:
|
||||
decodeCounter_ = 0;
|
||||
dcid_ = 0;
|
||||
prevPrim_ = GE_PRIM_INVALID;
|
||||
gstate_c.vertexFullAlpha = true;
|
||||
|
||||
#ifndef MOBILE_DEVICE
|
||||
host->GPUNotifyDraw();
|
||||
|
@ -43,6 +43,10 @@ struct DecVtxFormat;
|
||||
// DRAWN_ONCE -> death
|
||||
// DRAWN_RELIABLE -> death
|
||||
|
||||
// Bit flags kept in VertexArrayInfo::flags.
enum {
	// Recorded when gstate_c.vertexFullAlpha was true for the cached draw,
	// and used to restore that state when the cached vertices are reused.
	VAI_FLAG_VERTEXFULLALPHA = 1 << 0,
};
|
||||
|
||||
// Try to keep this POD.
|
||||
class VertexArrayInfo {
|
||||
public:
|
||||
@ -57,6 +61,7 @@ public:
|
||||
lastFrame = gpuStats.numFlips;
|
||||
numVerts = 0;
|
||||
drawsUntilNextFullHash = 0;
|
||||
flags = 0;
|
||||
}
|
||||
~VertexArrayInfo();
|
||||
|
||||
@ -85,6 +90,7 @@ public:
|
||||
int numFrames;
|
||||
int lastFrame; // So that we can forget.
|
||||
u16 drawsUntilNextFullHash;
|
||||
u8 flags;
|
||||
};
|
||||
|
||||
// Handles transform, lighting and drawing.
|
||||
|
@ -219,6 +219,7 @@ void VertexDecoder::Step_Color565() const
|
||||
c[1] = Convert6To8((cdata>>5) & 0x3f);
|
||||
c[2] = Convert5To8((cdata>>11) & 0x1f);
|
||||
c[3] = 255;
|
||||
// Always full alpha.
|
||||
}
|
||||
|
||||
void VertexDecoder::Step_Color5551() const
|
||||
@ -229,6 +230,7 @@ void VertexDecoder::Step_Color5551() const
|
||||
c[1] = Convert5To8((cdata>>5) & 0x1f);
|
||||
c[2] = Convert5To8((cdata>>10) & 0x1f);
|
||||
c[3] = (cdata >> 15) ? 255 : 0;
|
||||
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] != 0;
|
||||
}
|
||||
|
||||
void VertexDecoder::Step_Color4444() const
|
||||
@ -237,6 +239,7 @@ void VertexDecoder::Step_Color4444() const
|
||||
u16 cdata = *(u16*)(ptr_ + coloff);
|
||||
for (int j = 0; j < 4; j++)
|
||||
c[j] = Convert4To8((cdata >> (j * 4)) & 0xF);
|
||||
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255;
|
||||
}
|
||||
|
||||
void VertexDecoder::Step_Color8888() const
|
||||
@ -244,6 +247,7 @@ void VertexDecoder::Step_Color8888() const
|
||||
u8 *c = decoded_ + decFmt.c0off;
|
||||
const u8 *cdata = (const u8*)(ptr_ + coloff);
|
||||
memcpy(c, cdata, sizeof(u8) * 4);
|
||||
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255;
|
||||
}
|
||||
|
||||
void VertexDecoder::Step_Color565Morph() const
|
||||
@ -262,6 +266,7 @@ void VertexDecoder::Step_Color565Morph() const
|
||||
c[i] = (u8)col[i];
|
||||
}
|
||||
c[3] = 255;
|
||||
// Always full alpha.
|
||||
}
|
||||
|
||||
void VertexDecoder::Step_Color5551Morph() const
|
||||
@ -280,6 +285,7 @@ void VertexDecoder::Step_Color5551Morph() const
|
||||
for (int i = 0; i < 4; i++) {
|
||||
c[i] = (u8)col[i];
|
||||
}
|
||||
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255;
|
||||
}
|
||||
|
||||
void VertexDecoder::Step_Color4444Morph() const
|
||||
@ -296,6 +302,7 @@ void VertexDecoder::Step_Color4444Morph() const
|
||||
for (int i = 0; i < 4; i++) {
|
||||
c[i] = (u8)col[i];
|
||||
}
|
||||
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255;
|
||||
}
|
||||
|
||||
void VertexDecoder::Step_Color8888Morph() const
|
||||
@ -312,6 +319,7 @@ void VertexDecoder::Step_Color8888Morph() const
|
||||
for (int i = 0; i < 4; i++) {
|
||||
c[i] = (u8)(col[i]);
|
||||
}
|
||||
gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255;
|
||||
}
|
||||
|
||||
void VertexDecoder::Step_NormalS8() const
|
||||
@ -841,6 +849,7 @@ void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowe
|
||||
jitted_(ptr_, decoded_, count);
|
||||
} else {
|
||||
// Interpret the decode steps
|
||||
// TODO: Init gstate_c.vertexFullAlpha here? Or in Setup? When is it reset?
|
||||
for (; count; count--) {
|
||||
for (int i = 0; i < numSteps_; i++) {
|
||||
((*this).*steps_[i])();
|
||||
|
@ -266,6 +266,6 @@ private:
|
||||
bool CompileStep(const VertexDecoder &dec, int i);
|
||||
void Jit_ApplyWeights();
|
||||
void Jit_WriteMatrixMul(int outOff, bool pos);
|
||||
// Emits the code that converts the accumulated morph color to bytes and stores it
// at outOff in the decoded vertex. When checkAlpha is true, the emitted code also
// tests whether the resulting alpha is 0xFF and clears the full-alpha tracking if not.
// Only this single declaration is kept: the former one-argument overload would make
// calls like Jit_WriteMorphColor(off) ambiguous with the defaulted parameter.
void Jit_WriteMorphColor(int outOff, bool checkAlpha = true);
|
||||
const VertexDecoder *dec_;
|
||||
};
|
||||
|
@ -61,7 +61,8 @@ static const ARMReg tempReg2 = R4;
|
||||
static const ARMReg tempReg3 = R5;
static const ARMReg scratchReg = R6;
static const ARMReg scratchReg2 = R7;
// scratchReg3 uses R8 so that R12 is free for fullAlphaReg; the two must not alias,
// and only one definition of scratchReg3 may exist.
static const ARMReg scratchReg3 = R8;
// Running "all vertex alphas were 0xFF" flag: set to 0xFF before the decode loop
// and zeroed by the color steps when they see any other alpha.
static const ARMReg fullAlphaReg = R12;
static const ARMReg srcReg = R0;
static const ARMReg dstReg = R1;
static const ARMReg counterReg = R2;
|
||||
@ -262,6 +263,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
|
||||
// TODO: Preload scale factors
|
||||
}
|
||||
|
||||
if (dec.col) {
|
||||
// Or LDB and skip the conditional? This is probably cheaper.
|
||||
MOV(fullAlphaReg, 0xFF);
|
||||
}
|
||||
|
||||
JumpTarget loopStart = GetCodePtr();
|
||||
// Preload data cache ahead of reading. This offset seems pretty good.
|
||||
PLD(srcReg, 64);
|
||||
@ -281,6 +287,14 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
|
||||
SUBS(counterReg, counterReg, 1);
|
||||
B_CC(CC_NEQ, loopStart);
|
||||
|
||||
if (dec.col) {
|
||||
MOVP2R(tempReg1, &gstate_c.textureFullAlpha);
|
||||
CMP(fullAlphaReg, 0);
|
||||
SetCC(CC_EQ);
|
||||
STRB(fullAlphaReg, tempReg1, 0);
|
||||
SetCC(CC_AL);
|
||||
}
|
||||
|
||||
if (NEONSkinning || NEONMorphing) {
|
||||
VPOP(D8, 8);
|
||||
}
|
||||
@ -664,7 +678,12 @@ void VertexDecoderJitCache::Jit_TcFloatPrescale() {
|
||||
|
||||
// Emits the 8888 color step: the 32-bit color is copied through unchanged, and
// fullAlphaReg is zeroed if this vertex's alpha byte is anything but 0xFF.
void VertexDecoderJitCache::Jit_Color8888() {
	const auto colorSrcOff = dec_->coloff;
	const auto colorDstOff = dec_->decFmt.c0off;

	LDR(tempReg1, srcReg, colorSrcOff);
	// Arithmetic shift brings the alpha byte down sign-extended; inverting it gives
	// zero (Z set) only for alpha == 0xFF.
	MVNS(tempReg2, Operand2(tempReg1, ST_ASR, 24));
	// The store sits between the flag-setting MVNS and the conditional MOV below.
	STR(tempReg1, dstReg, colorDstOff);
	// Non-full alpha: clear the running flag.
	SetCC(CC_NEQ);
	MOV(fullAlphaReg, 0);
	SetCC(CC_AL);
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_Color4444() {
|
||||
@ -679,10 +698,16 @@ void VertexDecoderJitCache::Jit_Color4444() {
|
||||
ANDI2R(tempReg3, tempReg1, 0xF000, scratchReg);
|
||||
ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 12));
|
||||
|
||||
// And saturate.
|
||||
// And expand to 8 bits.
|
||||
ORR(tempReg1, tempReg2, Operand2(tempReg2, ST_LSL, 4));
|
||||
|
||||
STR(tempReg1, dstReg, dec_->decFmt.c0off);
|
||||
|
||||
// Set flags to determine if alpha != 0xFF.
|
||||
MVNS(tempReg2, Operand2(tempReg1, ST_ASR, 24));
|
||||
SetCC(CC_NEQ);
|
||||
MOV(fullAlphaReg, 0);
|
||||
SetCC(CC_AL);
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_Color565() {
|
||||
@ -706,7 +731,7 @@ void VertexDecoderJitCache::Jit_Color565() {
|
||||
ORR(tempReg3, tempReg3, Operand2(tempReg1, ST_LSR, 4));
|
||||
ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8));
|
||||
|
||||
// Add in full alpha.
|
||||
// Add in full alpha. No need to update fullAlphaReg.
|
||||
ORI2R(tempReg1, tempReg2, 0xFF000000, scratchReg);
|
||||
|
||||
STR(tempReg1, dstReg, dec_->decFmt.c0off);
|
||||
@ -731,8 +756,13 @@ void VertexDecoderJitCache::Jit_Color5551() {
|
||||
// Now we just need alpha. Since we loaded as signed, it'll be extended.
|
||||
ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg);
|
||||
ORR(tempReg2, tempReg2, tempReg1);
|
||||
|
||||
|
||||
// Set flags to determine if alpha != 0xFF.
|
||||
MVNS(tempReg3, Operand2(tempReg1, ST_ASR, 24));
|
||||
STR(tempReg2, dstReg, dec_->decFmt.c0off);
|
||||
SetCC(CC_NEQ);
|
||||
MOV(fullAlphaReg, 0);
|
||||
SetCC(CC_AL);
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_Color8888Morph() {
|
||||
@ -957,7 +987,7 @@ void VertexDecoderJitCache::Jit_Color565Morph() {
|
||||
} else {
|
||||
VMOV(S11, tempReg3);
|
||||
}
|
||||
Jit_WriteMorphColor(dec_->decFmt.c0off);
|
||||
Jit_WriteMorphColor(dec_->decFmt.c0off, false);
|
||||
}
|
||||
|
||||
// First is the left shift, second is the right shift (against walls, to get the RGBA values.)
|
||||
@ -1045,13 +1075,16 @@ void VertexDecoderJitCache::Jit_Color5551Morph() {
|
||||
}
|
||||
|
||||
// Expects RGBA color in S8 - S11, which is Q2.
|
||||
void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) {
|
||||
void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) {
|
||||
if (NEONMorphing) {
|
||||
ADDI2R(tempReg1, dstReg, outOff, scratchReg);
|
||||
VCVT(I_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
|
||||
VQMOVN(I_32 | I_UNSIGNED, neonScratchReg, neonScratchRegQ);
|
||||
VQMOVN(I_16 | I_UNSIGNED, neonScratchReg, neonScratchRegQ);
|
||||
VST1_lane(I_32, neonScratchReg, tempReg1, 0, true);
|
||||
if (checkAlpha) {
|
||||
VMOV_neon(I_32, scratchReg, neonScratchReg, 0);
|
||||
}
|
||||
} else {
|
||||
VCVT(S8, S8, TO_INT);
|
||||
VCVT(S9, S9, TO_INT);
|
||||
@ -1066,6 +1099,14 @@ void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) {
|
||||
ORR(scratchReg, scratchReg, Operand2(tempReg3, ST_LSL, 24));
|
||||
STR(scratchReg, dstReg, outOff);
|
||||
}
|
||||
|
||||
// Set flags to determine if alpha != 0xFF.
|
||||
if (checkAlpha) {
|
||||
MVNS(tempReg2, Operand2(scratchReg, ST_ASR, 24));
|
||||
SetCC(CC_NEQ);
|
||||
MOV(fullAlphaReg, 0);
|
||||
SetCC(CC_AL);
|
||||
}
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_NormalS8() {
|
||||
|
@ -187,7 +187,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
|
||||
if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
|
||||
dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
|
||||
dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
|
||||
prescaleStep = true;
|
||||
prescaleStep = true;
|
||||
}
|
||||
}
|
||||
|
||||
@ -556,6 +556,11 @@ void VertexDecoderJitCache::Jit_TcFloatThrough() {
|
||||
// Emits the 8888 color step: copies the 32-bit color through unchanged and clears
// gstate_c.vertexFullAlpha when this vertex's alpha is below 0xFF.
void VertexDecoderJitCache::Jit_Color8888() {
	MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff));
	MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1));

	// Alpha is the top byte, so the color is >= 0xFF000000 only when alpha == 0xFF.
	// The branch must be unsigned (CC_AE): as a signed value 0xFF000000 is negative,
	// so CC_GE would treat every color with alpha < 0x80 as full alpha.
	CMP(32, R(tempReg1), Imm32(0xFF000000));
	FixupBranch skip = J_CC(CC_AE, false);
	// Vertex colors feed the vertex flag (as the interpreter's Step_Color8888 does),
	// not textureFullAlpha.
	MOV(8, M(&gstate_c.vertexFullAlpha), Imm8(0));
	SetJumpTarget(skip);
}
|
||||
|
||||
static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, };
|
||||
@ -625,6 +630,11 @@ void VertexDecoderJitCache::Jit_Color4444() {
|
||||
OR(32, R(tempReg2), R(tempReg3));
|
||||
|
||||
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2));
|
||||
|
||||
CMP(32, R(tempReg2), Imm32(0xFF000000));
|
||||
FixupBranch skip = J_CC(CC_AE, false);
|
||||
MOV(8, M(&gstate_c.textureFullAlpha), Imm8(0));
|
||||
SetJumpTarget(skip);
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_Color565() {
|
||||
@ -661,6 +671,7 @@ void VertexDecoderJitCache::Jit_Color565() {
|
||||
OR(32, R(tempReg2), R(tempReg1));
|
||||
|
||||
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2));
|
||||
// Never has alpha, no need to update fullAlphaArg.
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_Color5551() {
|
||||
@ -696,6 +707,11 @@ void VertexDecoderJitCache::Jit_Color5551() {
|
||||
OR(32, R(tempReg2), R(tempReg1));
|
||||
|
||||
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2));
|
||||
|
||||
CMP(32, R(tempReg2), Imm32(0xFF000000));
|
||||
FixupBranch skip = J_CC(CC_AE, false);
|
||||
MOV(8, M(&gstate_c.textureFullAlpha), Imm8(0));
|
||||
SetJumpTarget(skip);
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_Color8888Morph() {
|
||||
@ -825,7 +841,7 @@ void VertexDecoderJitCache::Jit_Color565Morph() {
|
||||
}
|
||||
}
|
||||
|
||||
Jit_WriteMorphColor(dec_->decFmt.c0off);
|
||||
Jit_WriteMorphColor(dec_->decFmt.c0off, false);
|
||||
}
|
||||
|
||||
// Intentionally in reverse order.
|
||||
@ -884,12 +900,21 @@ void VertexDecoderJitCache::Jit_Color5551Morph() {
|
||||
Jit_WriteMorphColor(dec_->decFmt.c0off);
|
||||
}
|
||||
|
||||
void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) {
|
||||
void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) {
|
||||
// Pack back into a u32.
|
||||
CVTPS2DQ(fpScratchReg, R(fpScratchReg));
|
||||
PACKSSDW(fpScratchReg, R(fpScratchReg));
|
||||
PACKUSWB(fpScratchReg, R(fpScratchReg));
|
||||
MOVD_xmm(MDisp(dstReg, outOff), fpScratchReg);
|
||||
|
||||
// TODO: May be a faster way to do this without the MOVD.
|
||||
if (checkAlpha) {
|
||||
MOVD_xmm(R(tempReg1), fpScratchReg);
|
||||
CMP(32, R(tempReg1), Imm32(0xFF000000));
|
||||
FixupBranch skip = J_CC(CC_AE, false);
|
||||
MOV(8, M(&gstate_c.textureFullAlpha), Imm8(0));
|
||||
SetJumpTarget(skip);
|
||||
}
|
||||
}
|
||||
|
||||
// Copy 3 bytes and then a zero. Might as well copy four.
|
||||
@ -983,17 +1008,6 @@ void VertexDecoderJitCache::Jit_PosS8Through() {
|
||||
|
||||
// Through expands into floats, always. Might want to look at changing this.
|
||||
void VertexDecoderJitCache::Jit_PosS16Through() {
|
||||
// This commented out version is likely slightly faster but treats all three as signed, which
|
||||
// appears to be wrong.
|
||||
/*
|
||||
XORPS(XMM3, R(XMM3));
|
||||
MOVQ_xmm(XMM1, MDisp(srcReg, dec_->posoff));
|
||||
PUNPCKLWD(XMM1, R(XMM3));
|
||||
PSLLD(XMM1, 16);
|
||||
PSRAD(XMM1, 16); // Ugly sign extension, can be done faster in SSE4
|
||||
CVTDQ2PS(XMM3, R(XMM1));
|
||||
MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3);
|
||||
*/
|
||||
MOVSX(32, 16, tempReg1, MDisp(srcReg, dec_->posoff));
|
||||
MOVSX(32, 16, tempReg2, MDisp(srcReg, dec_->posoff + 2));
|
||||
MOVZX(32, 16, tempReg3, MDisp(srcReg, dec_->posoff + 4)); // NOTE: MOVZX
|
||||
|
Loading…
x
Reference in New Issue
Block a user