d3d: Avoid rewriting textures, just swizzle.

Luckily A is in the same place and the same width, so we can do this for
all but framebuffers easily.

Technically we could do it in OpenGL as well.

Small (1-2%) performance improvement in FF2.
This commit is contained in:
Unknown W. Brackets 2014-08-28 01:20:21 -07:00
parent 140eb82821
commit 1f51fe7843
4 changed files with 8 additions and 104 deletions

View File

@ -167,6 +167,7 @@ void ComputeFragmentShaderIDDX9(FragmentShaderIDDX9 *id) {
id->d[0] |= (doTextureProjection & 1) << 16;
id->d[0] |= (enableColorDoubling & 1) << 17;
id->d[0] |= (enableAlphaDoubling & 1) << 18;
id->d[0] |= (gstate_c.bgraTexture & 1) << 19;
if (enableAlphaTest)
gpuStats.numAlphaTestedDraws++;
@ -253,9 +254,9 @@ void GenerateFragmentShaderDX9(char *buffer) {
if (gstate.isTextureMapEnabled()) {
if (doTextureProjection) {
WRITE(p, " float4 t = tex2Dproj(tex, float4(In.v_texcoord.x, In.v_texcoord.y, 0, In.v_texcoord.z));\n");
WRITE(p, " float4 t = tex2Dproj(tex, float4(In.v_texcoord.x, In.v_texcoord.y, 0, In.v_texcoord.z))%s;\n", gstate_c.bgraTexture ? ".bgra" : "");
} else {
WRITE(p, " float4 t = tex2D(tex, In.v_texcoord.xy);\n");
WRITE(p, " float4 t = tex2D(tex, In.v_texcoord.xy)%s;\n", gstate_c.bgraTexture ? ".bgra" : "");
}
WRITE(p, " float4 p = In.v_color0;\n");

View File

@ -523,58 +523,6 @@ void TextureCacheDX9::UpdateSamplingParams(TexCacheEntry &entry, bool force) {
dxstate.texAddressV.set(tClamp ? D3DTADDRESS_CLAMP : D3DTADDRESS_WRAP);
}
static inline u32 ABGR2RGBA(u32 src) {
return ((src & 0xFF000000)) |
((src & 0x00FF0000) >> 16) |
((src & 0x0000FF00)) |
((src & 0x000000FF) << 16);
}
static void ClutConvertColors(void *dstBuf, const void *srcBuf, u32 dstFmt, int numPixels) {
// TODO: All these can be further sped up with SSE or NEON.
switch (dstFmt) {
case D3DFMT_A1R5G5B5:
{
const u16_le *src = (const u16_le *)srcBuf;
u16 *dst = (u16 *)dstBuf;
for (int i = 0; i < numPixels; i++) {
u16 rgb = (src[i]);
((uint16_t *)dst)[i] = (rgb & 0x83E0) | ((rgb & 0x1F) << 10) | ((rgb & 0x7C00) >> 10);
}
}
break;
case D3DFMT_A4R4G4B4:
{
const u16_le *src = (const u16_le *)srcBuf;
u16_le *dst = (u16_le *)dstBuf;
for (int i = 0; i < numPixels; i++) {
u16 rgb = src[i];
dst[i] = ((rgb & 0xF) << 8) | (rgb & 0xF0F0) | ((rgb & 0xF00) >> 8);
}
}
break;
case D3DFMT_R5G6B5:
{
const u16_le *src = (const u16_le *)srcBuf;
u16 *dst = (u16 *)dstBuf;
for (int i = 0; i < numPixels; i++) {
u16 rgb = src[i];
dst[i] = ((rgb & 0x1f) << 11) | (rgb & 0x7e0) | ((rgb & 0xF800) >> 11);
}
}
break;
default:
{
const u32 *src = (const u32 *)srcBuf;
u32 *dst = (u32*)dstBuf;
for (int i = 0; i < numPixels; i++) {
dst[i] = ABGR2RGBA(src[i]);
}
}
break;
}
}
void TextureCacheDX9::StartFrame() {
lastBoundTexture = INVALID_TEX;
if (clearCacheNextFrame_) {
@ -589,37 +537,6 @@ static inline u32 MiniHash(const u32 *ptr) {
return ptr[0];
}
static inline u32 QuickClutHash(const u8 *clut, u32 bytes) {
// CLUTs always come in multiples of 32 bytes, can't load them any other way.
_dbg_assert_msg_(G3D, (bytes & 31) == 0, "CLUT should always have a multiple of 32 bytes.");
const u32 prime = 2246822519U;
u32 hash = 0;
#ifdef _M_SSE
if ((((u32)(intptr_t)clut) & 0xf) == 0) {
__m128i cursor = _mm_set1_epi32(0);
const __m128i mult = _mm_set1_epi32(prime);
const __m128i *p = (const __m128i *)clut;
for (u32 i = 0; i < bytes / 16; ++i) {
cursor = _mm_add_epi32(cursor, _mm_mul_epu32(_mm_load_si128(&p[i]), mult));
}
// Add the four parts into the low i32.
cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 8));
cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 4));
hash = _mm_cvtsi128_si32(cursor);
} else {
#else
// TODO: ARM NEON implementation (using CPUDetect to be sure it has NEON.)
{
#endif
for (const u32 *p = (u32 *)clut, *end = (u32 *)(clut + bytes); p < end; ) {
hash += *p++ * prime;
}
}
return hash;
}
static inline u32 QuickTexHash(u32 addr, int bufw, int w, int h, GETextureFormat format) {
const u32 sizeInRAM = (textureBitsPerPixel[format] * bufw * h) / 8;
const u32 *checkp = (const u32 *) Memory::GetPointer(addr);
@ -653,19 +570,7 @@ void TextureCacheDX9::UpdateCurrentClut() {
const u32 clutExtendedBytes = clutTotalBytes_ + clutBaseBytes;
clutHash_ = DoReliableHash((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888);
/*
// Avoid a copy when we don't need to convert colors.
if (clutFormat != GE_CMODE_32BIT_ABGR8888) {
ClutConvertColors(clutBufConverted_, clutBufRaw_, getClutDestFormat(clutFormat), clutExtendedBytes / sizeof(u16));
clutBuf_ = clutBufConverted_;
} else {
clutBuf_ = clutBufRaw_;
}
*/
ClutConvertColors(clutBufConverted_, clutBufRaw_, getClutDestFormat(clutFormat), clutExtendedBytes / sizeof(u16));
clutBuf_ = clutBufConverted_;
//clutBuf_ = clutBufRaw_;
clutBuf_ = clutBufRaw_;
// Special optimization: fonts typically draw clut4 with just alpha values in a single color.
clutAlphaLinear_ = false;
@ -720,6 +625,7 @@ void TextureCacheDX9::SetTextureFramebuffer(TexCacheEntry *entry)
gstate_c.curTextureWidth = entry->framebuffer->width;
gstate_c.curTextureHeight = entry->framebuffer->height;
gstate_c.flipTexture = false;
gstate_c.bgraTexture = false;
gstate_c.textureFullAlpha = entry->framebuffer->format == GE_FORMAT_565;
} else {
if (entry->framebuffer->fbo)
@ -780,6 +686,7 @@ void TextureCacheDX9::SetTexture(bool force) {
TexCache::iterator iter = cache.find(cachekey);
TexCacheEntry *entry = NULL;
gstate_c.flipTexture = false;
gstate_c.bgraTexture = true;
gstate_c.skipDrawReason &= ~SKIPDRAW_BAD_FB_TEXTURE;
bool useBufferedRendering = g_Config.iRenderingMode != FB_NON_BUFFERED_MODE;
bool replaceImages = false;
@ -1118,7 +1025,6 @@ void *TextureCacheDX9::DecodeTextureLevel(GETextureFormat format, GEPaletteForma
tmpTexBuf32.resize(std::max(bufw, w) * h);
finalBuf = UnswizzleFromMem(texaddr, bufw, 2, level);
}
ClutConvertColors(finalBuf, finalBuf, dstFmt, bufw * h);
break;
case GE_TFMT_8888:
@ -1140,7 +1046,6 @@ void *TextureCacheDX9::DecodeTextureLevel(GETextureFormat format, GEPaletteForma
tmpTexBuf32.resize(std::max(bufw, w) * h);
finalBuf = UnswizzleFromMem(texaddr, bufw, 4, level);
}
ClutConvertColors(finalBuf, finalBuf, dstFmt, bufw * h);
break;
case GE_TFMT_DXT1:
@ -1160,7 +1065,6 @@ void *TextureCacheDX9::DecodeTextureLevel(GETextureFormat format, GEPaletteForma
}
}
finalBuf = tmpTexBuf32.data();
ClutConvertColors(finalBuf, finalBuf, dstFmt, bufw * h);
w = (w + 3) & ~3;
}
break;
@ -1183,7 +1087,6 @@ void *TextureCacheDX9::DecodeTextureLevel(GETextureFormat format, GEPaletteForma
}
w = (w + 3) & ~3;
finalBuf = tmpTexBuf32.data();
ClutConvertColors(finalBuf, finalBuf, dstFmt, bufw * h);
}
break;
@ -1205,7 +1108,6 @@ void *TextureCacheDX9::DecodeTextureLevel(GETextureFormat format, GEPaletteForma
}
w = (w + 3) & ~3;
finalBuf = tmpTexBuf32.data();
ClutConvertColors(finalBuf, finalBuf, dstFmt, bufw * h);
}
break;

View File

@ -298,7 +298,7 @@ void GPUStateCache::DoState(PointerWrap &p) {
p.Do(flipTexture);
}
// needShaderTexClamp doesn't need to be saved.
// needShaderTexClamp and bgraTexture don't need to be saved.
if (s >= 3) {
p.Do(textureSimpleAlpha);

View File

@ -460,6 +460,7 @@ struct GPUStateCache
UVScale uv;
bool flipTexture;
bool bgraTexture;
bool needShaderTexClamp;
bool allowShaderBlend;