Merge pull request #5696 from unknownbrackets/texcache

Use NEON for unswizzling, minor tweak to texcache
This commit is contained in:
Henrik Rydgård 2014-03-23 09:45:36 +01:00
commit 941b8b4663
6 changed files with 97 additions and 68 deletions

View File

@ -105,16 +105,65 @@ static u32 QuickTexHashBasic(const void *checkp, u32 size) {
return check;
}
void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) {
#ifdef _M_SSE
const __m128i *src = (const __m128i *)texptr;
for (int by = 0; by < byc; by++) {
__m128i *xdest = (__m128i *)ydestp;
for (int bx = 0; bx < bxc; bx++) {
__m128i *dest = xdest;
for (int n = 0; n < 2; n++) {
// Textures are always 16-byte aligned so this is fine.
__m128i temp1 = _mm_load_si128(src);
__m128i temp2 = _mm_load_si128(src + 1);
__m128i temp3 = _mm_load_si128(src + 2);
__m128i temp4 = _mm_load_si128(src + 3);
_mm_store_si128(dest, temp1);
dest += pitch >> 2;
_mm_store_si128(dest, temp2);
dest += pitch >> 2;
_mm_store_si128(dest, temp3);
dest += pitch >> 2;
_mm_store_si128(dest, temp4);
dest += pitch >> 2;
src += 4;
}
xdest ++;
}
ydestp += (rowWidth * 8) / 4;
}
#else
const u32 *src = (const u32 *)texptr;
for (int by = 0; by < byc; by++) {
u32 *xdest = ydestp;
for (int bx = 0; bx < bxc; bx++) {
u32 *dest = xdest;
for (int n = 0; n < 8; n++) {
memcpy(dest, src, 16);
dest += pitch;
src += 4;
}
xdest += 4;
}
ydestp += (rowWidth * 8) / 4;
}
#endif
}
QuickTexHashFunc DoQuickTexHash = &QuickTexHashBasic;
UnswizzleTex16Func DoUnswizzleTex16 = &DoUnswizzleTex16Basic;
// This has to be done after CPUDetect has done its magic.
void SetupQuickTexHash() {
void SetupTextureDecoder() {
#ifdef ARMV7
if (cpu_info.bNEON)
if (cpu_info.bNEON) {
DoQuickTexHash = &QuickTexHashNEON;
DoUnswizzleTex16 = &DoUnswizzleTex16NEON;
}
#elif _M_SSE
if (cpu_info.bSSE2)
if (cpu_info.bSSE2) {
DoQuickTexHash = &QuickTexHashSSE2;
}
#endif
}

View File

@ -22,10 +22,14 @@
#include "GPU/ge_constants.h"
#include "GPU/GPUState.h"
void SetupQuickTexHash();
void SetupTextureDecoder();
typedef u32 (*QuickTexHashFunc)(const void *checkp, u32 size);
extern QuickTexHashFunc DoQuickTexHash;
typedef void (*UnswizzleTex16Func)(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
extern UnswizzleTex16Func DoUnswizzleTex16;
// All these DXT structs are in the reverse order, as compared to PC.
// On PC, alpha comes before color, and interpolants are before the tile data.

View File

@ -84,7 +84,8 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
// Okay, do the memory hashing.
"QuickTexHashNEON_next:\n"
"pld [%2, #0xc0]\n"
"vldmia %2!, {d16-d23}\n"
"vld1.32 {d16, d17, d18, d19}, [%2, :128]!\n"
"vld1.32 {d20, d21, d22, d23}, [%2, :128]!\n"
"vmla.i32 q0, q1, q8\n"
"vmul.i32 q11, q11, q1\n"
"veor.i32 q0, q0, q9\n"
@ -118,3 +119,34 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
return check;
}
void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) {
__builtin_prefetch(texptr, 0, 0);
__builtin_prefetch(ydestp, 1, 1);
const u32 *src = (const u32 *)texptr;
for (int by = 0; by < byc; by++) {
u32 *xdest = ydestp;
for (int bx = 0; bx < bxc; bx++) {
u32 *dest = xdest;
for (int n = 0; n < 2; n++) {
// Textures are always 16-byte aligned so this is fine.
uint32x4_t temp1 = vld1q_u32(src);
uint32x4_t temp2 = vld1q_u32(src + 4);
uint32x4_t temp3 = vld1q_u32(src + 8);
uint32x4_t temp4 = vld1q_u32(src + 12);
vst1q_u32(dest, temp1);
dest += pitch;
vst1q_u32(dest, temp2);
dest += pitch;
vst1q_u32(dest, temp3);
dest += pitch;
vst1q_u32(dest, temp4);
dest += pitch;
src += 16;
}
xdest += 4;
}
ydestp += (rowWidth * 8) / 4;
}
}

View File

@ -17,4 +17,5 @@
#include "GPU/Common/TextureDecoder.h"
u32 QuickTexHashNEON(const void *checkp, u32 size);
u32 QuickTexHashNEON(const void *checkp, u32 size);
void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);

View File

@ -58,7 +58,7 @@ TextureCacheDX9::TextureCacheDX9() : clearCacheNextFrame_(false), lowMemoryMode_
clutBufRaw_ = (u32 *)AllocateAlignedMemory(4096 * sizeof(u32), 16); // 16KB
// glGetFloatv(GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT, &maxAnisotropyLevel);
maxAnisotropyLevel = 16;
SetupQuickTexHash();
SetupTextureDecoder();
#ifdef _XBOX
// TODO: Maybe not? This decimates more often, but it may be speed harmful if unnecessary.
lowMemoryMode_ = true;
@ -286,21 +286,9 @@ void *TextureCacheDX9::UnswizzleFromMem(u32 texaddr, u32 bufw, u32 bytesPerPixel
u32 ydest = 0;
if (rowWidth >= 16) {
const u32 *src = (u32 *) Memory::GetPointer(texaddr);
u32 *ydestp = tmpTexBuf32.data();
for (int by = 0; by < byc; by++) {
u32 *xdest = ydestp;
for (int bx = 0; bx < bxc; bx++) {
u32 *dest = xdest;
for (int n = 0; n < 8; n++) {
memcpy(dest, src, 16);
dest += pitch;
src += 4;
}
xdest += 4;
}
ydestp += (rowWidth * 8) / 4;
}
// The most common one, so it gets an optimized implementation.
DoUnswizzleTex16(Memory::GetPointer(texaddr), ydestp, bxc, byc, pitch, rowWidth);
} else if (rowWidth == 8) {
const u32 *src = (u32 *) Memory::GetPointer(texaddr);
for (int by = 0; by < byc; by++) {

View File

@ -66,7 +66,7 @@ TextureCache::TextureCache() : clearCacheNextFrame_(false), lowMemoryMode_(false
clutBufConverted_ = (u32 *)AllocateAlignedMemory(4096 * sizeof(u32), 16); // 16KB
clutBufRaw_ = (u32 *)AllocateAlignedMemory(4096 * sizeof(u32), 16); // 16KB
glGetFloatv(GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT, &maxAnisotropyLevel);
SetupQuickTexHash();
SetupTextureDecoder();
}
TextureCache::~TextureCache() {
@ -285,51 +285,6 @@ void TextureCache::NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffe
}
}
static void Unswizzle16(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) {
#ifdef _M_SSE
const __m128i *src = (const __m128i *)texptr;
for (int by = 0; by < byc; by++) {
__m128i *xdest = (__m128i *)ydestp;
for (int bx = 0; bx < bxc; bx++) {
__m128i *dest = xdest;
for (int n = 0; n < 2; n++) {
// Textures are always 16-byte aligned so this is fine.
__m128i temp1 = _mm_load_si128(src);
__m128i temp2 = _mm_load_si128(src + 1);
__m128i temp3 = _mm_load_si128(src + 2);
__m128i temp4 = _mm_load_si128(src + 3);
_mm_store_si128(dest, temp1);
dest += pitch >> 2;
_mm_store_si128(dest, temp2);
dest += pitch >> 2;
_mm_store_si128(dest, temp3);
dest += pitch >> 2;
_mm_store_si128(dest, temp4);
dest += pitch >> 2;
src += 4;
}
xdest ++;
}
ydestp += (rowWidth * 8) / 4;
}
#else
const u32 *src = (const u32 *)texptr;
for (int by = 0; by < byc; by++) {
u32 *xdest = ydestp;
for (int bx = 0; bx < bxc; bx++) {
u32 *dest = xdest;
for (int n = 0; n < 8; n++) {
memcpy(dest, src, 16);
dest += pitch;
src += 4;
}
xdest += 4;
}
ydestp += (rowWidth * 8) / 4;
}
#endif
}
void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 bytesPerPixel, u32 level) {
const u32 rowWidth = (bytesPerPixel > 0) ? (bufw * bytesPerPixel) : (bufw / 2);
const u32 pitch = rowWidth / 4;
@ -342,7 +297,7 @@ void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 bytesPerPix
if (rowWidth >= 16) {
u32 *ydestp = tmpTexBuf32.data();
// The most common one, so it gets an optimized implementation.
Unswizzle16(texptr, ydestp, bxc, byc, pitch, rowWidth);
DoUnswizzleTex16(texptr, ydestp, bxc, byc, pitch, rowWidth);
} else if (rowWidth == 8) {
const u32 *src = (const u32 *) texptr;
for (int by = 0; by < byc; by++) {