mirror of
https://github.com/libretro/ppsspp.git
synced 2025-03-03 14:09:45 +00:00
Merge pull request #5696 from unknownbrackets/texcache
Use NEON for unswizzling, minor tweak to texcache
This commit is contained in:
commit
941b8b4663
@ -105,16 +105,65 @@ static u32 QuickTexHashBasic(const void *checkp, u32 size) {
|
||||
return check;
|
||||
}
|
||||
|
||||
void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) {
|
||||
#ifdef _M_SSE
|
||||
const __m128i *src = (const __m128i *)texptr;
|
||||
for (int by = 0; by < byc; by++) {
|
||||
__m128i *xdest = (__m128i *)ydestp;
|
||||
for (int bx = 0; bx < bxc; bx++) {
|
||||
__m128i *dest = xdest;
|
||||
for (int n = 0; n < 2; n++) {
|
||||
// Textures are always 16-byte aligned so this is fine.
|
||||
__m128i temp1 = _mm_load_si128(src);
|
||||
__m128i temp2 = _mm_load_si128(src + 1);
|
||||
__m128i temp3 = _mm_load_si128(src + 2);
|
||||
__m128i temp4 = _mm_load_si128(src + 3);
|
||||
_mm_store_si128(dest, temp1);
|
||||
dest += pitch >> 2;
|
||||
_mm_store_si128(dest, temp2);
|
||||
dest += pitch >> 2;
|
||||
_mm_store_si128(dest, temp3);
|
||||
dest += pitch >> 2;
|
||||
_mm_store_si128(dest, temp4);
|
||||
dest += pitch >> 2;
|
||||
src += 4;
|
||||
}
|
||||
xdest ++;
|
||||
}
|
||||
ydestp += (rowWidth * 8) / 4;
|
||||
}
|
||||
#else
|
||||
const u32 *src = (const u32 *)texptr;
|
||||
for (int by = 0; by < byc; by++) {
|
||||
u32 *xdest = ydestp;
|
||||
for (int bx = 0; bx < bxc; bx++) {
|
||||
u32 *dest = xdest;
|
||||
for (int n = 0; n < 8; n++) {
|
||||
memcpy(dest, src, 16);
|
||||
dest += pitch;
|
||||
src += 4;
|
||||
}
|
||||
xdest += 4;
|
||||
}
|
||||
ydestp += (rowWidth * 8) / 4;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
QuickTexHashFunc DoQuickTexHash = &QuickTexHashBasic;
|
||||
UnswizzleTex16Func DoUnswizzleTex16 = &DoUnswizzleTex16Basic;
|
||||
|
||||
// This has to be done after CPUDetect has done its magic.
|
||||
void SetupQuickTexHash() {
|
||||
void SetupTextureDecoder() {
|
||||
#ifdef ARMV7
|
||||
if (cpu_info.bNEON)
|
||||
if (cpu_info.bNEON) {
|
||||
DoQuickTexHash = &QuickTexHashNEON;
|
||||
DoUnswizzleTex16 = &DoUnswizzleTex16NEON;
|
||||
}
|
||||
#elif _M_SSE
|
||||
if (cpu_info.bSSE2)
|
||||
if (cpu_info.bSSE2) {
|
||||
DoQuickTexHash = &QuickTexHashSSE2;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -22,10 +22,14 @@
|
||||
#include "GPU/ge_constants.h"
|
||||
#include "GPU/GPUState.h"
|
||||
|
||||
void SetupQuickTexHash();
|
||||
void SetupTextureDecoder();
|
||||
|
||||
typedef u32 (*QuickTexHashFunc)(const void *checkp, u32 size);
|
||||
extern QuickTexHashFunc DoQuickTexHash;
|
||||
|
||||
typedef void (*UnswizzleTex16Func)(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
|
||||
extern UnswizzleTex16Func DoUnswizzleTex16;
|
||||
|
||||
// All these DXT structs are in the reverse order, as compared to PC.
|
||||
// On PC, alpha comes before color, and interpolants are before the tile data.
|
||||
|
||||
|
@ -84,7 +84,8 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
|
||||
// Okay, do the memory hashing.
|
||||
"QuickTexHashNEON_next:\n"
|
||||
"pld [%2, #0xc0]\n"
|
||||
"vldmia %2!, {d16-d23}\n"
|
||||
"vld1.32 {d16, d17, d18, d19}, [%2, :128]!\n"
|
||||
"vld1.32 {d20, d21, d22, d23}, [%2, :128]!\n"
|
||||
"vmla.i32 q0, q1, q8\n"
|
||||
"vmul.i32 q11, q11, q1\n"
|
||||
"veor.i32 q0, q0, q9\n"
|
||||
@ -118,3 +119,34 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
|
||||
|
||||
return check;
|
||||
}
|
||||
|
||||
void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) {
|
||||
__builtin_prefetch(texptr, 0, 0);
|
||||
__builtin_prefetch(ydestp, 1, 1);
|
||||
|
||||
const u32 *src = (const u32 *)texptr;
|
||||
for (int by = 0; by < byc; by++) {
|
||||
u32 *xdest = ydestp;
|
||||
for (int bx = 0; bx < bxc; bx++) {
|
||||
u32 *dest = xdest;
|
||||
for (int n = 0; n < 2; n++) {
|
||||
// Textures are always 16-byte aligned so this is fine.
|
||||
uint32x4_t temp1 = vld1q_u32(src);
|
||||
uint32x4_t temp2 = vld1q_u32(src + 4);
|
||||
uint32x4_t temp3 = vld1q_u32(src + 8);
|
||||
uint32x4_t temp4 = vld1q_u32(src + 12);
|
||||
vst1q_u32(dest, temp1);
|
||||
dest += pitch;
|
||||
vst1q_u32(dest, temp2);
|
||||
dest += pitch;
|
||||
vst1q_u32(dest, temp3);
|
||||
dest += pitch;
|
||||
vst1q_u32(dest, temp4);
|
||||
dest += pitch;
|
||||
src += 16;
|
||||
}
|
||||
xdest += 4;
|
||||
}
|
||||
ydestp += (rowWidth * 8) / 4;
|
||||
}
|
||||
}
|
||||
|
@ -17,4 +17,5 @@
|
||||
|
||||
#include "GPU/Common/TextureDecoder.h"
|
||||
|
||||
u32 QuickTexHashNEON(const void *checkp, u32 size);
|
||||
u32 QuickTexHashNEON(const void *checkp, u32 size);
|
||||
void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
|
||||
|
@ -58,7 +58,7 @@ TextureCacheDX9::TextureCacheDX9() : clearCacheNextFrame_(false), lowMemoryMode_
|
||||
clutBufRaw_ = (u32 *)AllocateAlignedMemory(4096 * sizeof(u32), 16); // 16KB
|
||||
// glGetFloatv(GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT, &maxAnisotropyLevel);
|
||||
maxAnisotropyLevel = 16;
|
||||
SetupQuickTexHash();
|
||||
SetupTextureDecoder();
|
||||
#ifdef _XBOX
|
||||
// TODO: Maybe not? This decimates more often, but it may be speed harmful if unnecessary.
|
||||
lowMemoryMode_ = true;
|
||||
@ -286,21 +286,9 @@ void *TextureCacheDX9::UnswizzleFromMem(u32 texaddr, u32 bufw, u32 bytesPerPixel
|
||||
|
||||
u32 ydest = 0;
|
||||
if (rowWidth >= 16) {
|
||||
const u32 *src = (u32 *) Memory::GetPointer(texaddr);
|
||||
u32 *ydestp = tmpTexBuf32.data();
|
||||
for (int by = 0; by < byc; by++) {
|
||||
u32 *xdest = ydestp;
|
||||
for (int bx = 0; bx < bxc; bx++) {
|
||||
u32 *dest = xdest;
|
||||
for (int n = 0; n < 8; n++) {
|
||||
memcpy(dest, src, 16);
|
||||
dest += pitch;
|
||||
src += 4;
|
||||
}
|
||||
xdest += 4;
|
||||
}
|
||||
ydestp += (rowWidth * 8) / 4;
|
||||
}
|
||||
// The most common one, so it gets an optimized implementation.
|
||||
DoUnswizzleTex16(Memory::GetPointer(texaddr), ydestp, bxc, byc, pitch, rowWidth);
|
||||
} else if (rowWidth == 8) {
|
||||
const u32 *src = (u32 *) Memory::GetPointer(texaddr);
|
||||
for (int by = 0; by < byc; by++) {
|
||||
|
@ -66,7 +66,7 @@ TextureCache::TextureCache() : clearCacheNextFrame_(false), lowMemoryMode_(false
|
||||
clutBufConverted_ = (u32 *)AllocateAlignedMemory(4096 * sizeof(u32), 16); // 16KB
|
||||
clutBufRaw_ = (u32 *)AllocateAlignedMemory(4096 * sizeof(u32), 16); // 16KB
|
||||
glGetFloatv(GL_MAX_TEXTURE_MAX_ANISOTROPY_EXT, &maxAnisotropyLevel);
|
||||
SetupQuickTexHash();
|
||||
SetupTextureDecoder();
|
||||
}
|
||||
|
||||
TextureCache::~TextureCache() {
|
||||
@ -285,51 +285,6 @@ void TextureCache::NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffe
|
||||
}
|
||||
}
|
||||
|
||||
static void Unswizzle16(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) {
|
||||
#ifdef _M_SSE
|
||||
const __m128i *src = (const __m128i *)texptr;
|
||||
for (int by = 0; by < byc; by++) {
|
||||
__m128i *xdest = (__m128i *)ydestp;
|
||||
for (int bx = 0; bx < bxc; bx++) {
|
||||
__m128i *dest = xdest;
|
||||
for (int n = 0; n < 2; n++) {
|
||||
// Textures are always 16-byte aligned so this is fine.
|
||||
__m128i temp1 = _mm_load_si128(src);
|
||||
__m128i temp2 = _mm_load_si128(src + 1);
|
||||
__m128i temp3 = _mm_load_si128(src + 2);
|
||||
__m128i temp4 = _mm_load_si128(src + 3);
|
||||
_mm_store_si128(dest, temp1);
|
||||
dest += pitch >> 2;
|
||||
_mm_store_si128(dest, temp2);
|
||||
dest += pitch >> 2;
|
||||
_mm_store_si128(dest, temp3);
|
||||
dest += pitch >> 2;
|
||||
_mm_store_si128(dest, temp4);
|
||||
dest += pitch >> 2;
|
||||
src += 4;
|
||||
}
|
||||
xdest ++;
|
||||
}
|
||||
ydestp += (rowWidth * 8) / 4;
|
||||
}
|
||||
#else
|
||||
const u32 *src = (const u32 *)texptr;
|
||||
for (int by = 0; by < byc; by++) {
|
||||
u32 *xdest = ydestp;
|
||||
for (int bx = 0; bx < bxc; bx++) {
|
||||
u32 *dest = xdest;
|
||||
for (int n = 0; n < 8; n++) {
|
||||
memcpy(dest, src, 16);
|
||||
dest += pitch;
|
||||
src += 4;
|
||||
}
|
||||
xdest += 4;
|
||||
}
|
||||
ydestp += (rowWidth * 8) / 4;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 bytesPerPixel, u32 level) {
|
||||
const u32 rowWidth = (bytesPerPixel > 0) ? (bufw * bytesPerPixel) : (bufw / 2);
|
||||
const u32 pitch = rowWidth / 4;
|
||||
@ -342,7 +297,7 @@ void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 bytesPerPix
|
||||
if (rowWidth >= 16) {
|
||||
u32 *ydestp = tmpTexBuf32.data();
|
||||
// The most common one, so it gets an optimized implementation.
|
||||
Unswizzle16(texptr, ydestp, bxc, byc, pitch, rowWidth);
|
||||
DoUnswizzleTex16(texptr, ydestp, bxc, byc, pitch, rowWidth);
|
||||
} else if (rowWidth == 8) {
|
||||
const u32 *src = (const u32 *) texptr;
|
||||
for (int by = 0; by < byc; by++) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user