VideoCommon: merged the SSSE3/SSE4.1 code paths. Added some additional SSSE3/SSE4.1 code paths which will be used in "The Legend of Zelda: Twilight Princess".

These code paths are only compiled in when "_M_SSE=0x301" (for SSSE3) or "_M_SSE=0x401" (for SSE4.1) is defined as a preprocessor definition.


git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5300 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
nodchip 2010-04-09 03:02:12 +00:00
parent 73caf37bca
commit 6136c94de5
4 changed files with 171 additions and 5 deletions

View File

@ -16,6 +16,13 @@
// http://code.google.com/p/dolphin-emu/ // http://code.google.com/p/dolphin-emu/
#include <cmath> #include <cmath>
#if _M_SSE >= 0x401
#include <smmintrin.h>
#elif _M_SSE >= 0x301
#include <tmmintrin.h>
#endif
#include "Common.h" #include "Common.h"
//#include "VideoCommon.h" // to get debug logs //#include "VideoCommon.h" // to get debug logs
@ -359,14 +366,35 @@ inline void decodebytesC8_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr)
} }
} }
#if _M_SSE >= 0x301
// Control mask for _mm_shuffle_epi8(): exchanges the two bytes inside each of the
// eight 16-bit lanes, i.e. Common::swap16() applied to the whole vector.
static const __m128i kMaskSwap16 = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L);
#endif
// Reads 8 palette indices from src, looks each one up in the 16-bit TLUT located at
// texMem + tlutaddr, byte-swaps the entries and writes the 8 results to dst.
// NOTE(review): the SSSE3 path uses _mm_stream_si128(), which requires a 16-byte
// aligned destination — confirm that every caller passes such a dst.
inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr) inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr)
{ {
u16* tlut = (u16*)(texMem + tlutaddr); u16* tlut = (u16*)(texMem + tlutaddr);
#if _M_SSE >= 0x301
// Gather the eight 16-bit TLUT entries selected by src[0..7] into one vector
// (_mm_set_epi16 takes the highest lane first, so src[0] ends up in the lowest lane)
const __m128i a = _mm_set_epi16(tlut[src[7]], tlut[src[6]], tlut[src[5]], tlut[src[4]], tlut[src[3]], tlut[src[2]], tlut[src[1]], tlut[src[0]]);
// Apply Common::swap16() to all eight 16-bit values at once
const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16);
// Write the 16 bytes to dst with a streaming store so the caches are not polluted
_mm_stream_si128((__m128i*)dst, b);
#else
// Scalar fallback: one table lookup and byte swap per index
for (int x = 0; x < 8; x++) for (int x = 0; x < 8; x++)
{ {
u8 val = src[x]; u8 val = src[x];
*dst++ = Common::swap16(tlut[val]); *dst++ = Common::swap16(tlut[val]);
} }
#endif
} }
@ -869,7 +897,9 @@ PC_TexFormat TexDecoder_DirectDecode_real(u8 *dst, const u8 *src, int width, int
return PC_TEX_FMT_NONE; return PC_TEX_FMT_NONE;
} }
#if _M_SSE >= 0x301
// Control mask for _mm_shuffle_epi8(): reverses the four bytes inside each of the
// four 32-bit lanes, i.e. Common::swap32() applied to the whole vector.
static const __m128i kMaskSwap32 = _mm_set_epi32(0x0C0D0E0FL, 0x08090A0BL, 0x04050607L, 0x00010203L);
#endif
//switch endianness, unswizzle //switch endianness, unswizzle
//TODO: to save memory, don't blindly convert everything to argb8888 //TODO: to save memory, don't blindly convert everything to argb8888
@ -997,13 +1027,59 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
return PC_TEX_FMT_BGRA32; return PC_TEX_FMT_BGRA32;
case GX_TF_RGBA8: // speed critical case GX_TF_RGBA8: // speed critical
{ {
#if _M_SSE >= 0x301
for (int y = 0; y < height; y += 4) {
__m128i* p = (__m128i*)(src + y * width * 4);
for (int x = 0; x < width; x += 4) {
#if _M_SSE >= 0x401
// Load 64-bytes at once.
const __m128i a0 = _mm_stream_load_si128(p++);
const __m128i a1 = _mm_stream_load_si128(p++);
const __m128i a2 = _mm_stream_load_si128(p++);
const __m128i a3 = _mm_stream_load_si128(p++);
#else
const __m128i a0 = _mm_load_si128(p++);
const __m128i a1 = _mm_load_si128(p++);
const __m128i a2 = _mm_load_si128(p++);
const __m128i a3 = _mm_load_si128(p++);
#endif
// Shuffle 16-bit integers by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(),
// apply Common::swap32() by _mm_shuffle_epi8() and
// store them by _mm_stream_si128().
// See decodebytesARGB8_4() about the idea.
const __m128i b0 = _mm_unpacklo_epi16(a0, a2);
const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32);
_mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0);
const __m128i b1 = _mm_unpackhi_epi16(a0, a2);
const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32);
_mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1);
const __m128i b2 = _mm_unpacklo_epi16(a1, a3);
const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32);
_mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2);
const __m128i b3 = _mm_unpackhi_epi16(a1, a3);
const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32);
_mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3);
}
}
#else
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4) for (int x = 0; x < width; x += 4)
{ {
for (int iy = 0; iy < 4; iy++) for (int iy = 0; iy < 4; iy++)
decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16); decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16);
src += 64; src += 64;
} }
#endif
} }
return PC_TEX_FMT_BGRA32; return PC_TEX_FMT_BGRA32;
case GX_TF_CMPR: // speed critical case GX_TF_CMPR: // speed critical

View File

@ -21,6 +21,10 @@
#include "VertexLoader_Normal.h" #include "VertexLoader_Normal.h"
#include "NativeVertexWriter.h" #include "NativeVertexWriter.h"
#if _M_SSE >= 0x301
#include <tmmintrin.h>
#endif
#define LOG_NORM8() // PRIM_LOG("norm: %f %f %f, ", ((s8*)VertexManager::s_pCurBufferPointer)[-3]/127.0f, ((s8*)VertexManager::s_pCurBufferPointer)[-2]/127.0f, ((s8*)VertexManager::s_pCurBufferPointer)[-1]/127.0f); #define LOG_NORM8() // PRIM_LOG("norm: %f %f %f, ", ((s8*)VertexManager::s_pCurBufferPointer)[-3]/127.0f, ((s8*)VertexManager::s_pCurBufferPointer)[-2]/127.0f, ((s8*)VertexManager::s_pCurBufferPointer)[-1]/127.0f);
#define LOG_NORM16() // PRIM_LOG("norm: %f %f %f, ", ((s16*)VertexManager::s_pCurBufferPointer)[-3]/32767.0f, ((s16*)VertexManager::s_pCurBufferPointer)[-2]/32767.0f, ((s16*)VertexManager::s_pCurBufferPointer)[-1]/32767.0f); #define LOG_NORM16() // PRIM_LOG("norm: %f %f %f, ", ((s16*)VertexManager::s_pCurBufferPointer)[-3]/32767.0f, ((s16*)VertexManager::s_pCurBufferPointer)[-2]/32767.0f, ((s16*)VertexManager::s_pCurBufferPointer)[-1]/32767.0f);
#define LOG_NORMF() // PRIM_LOG("norm: %f %f %f, ", ((float*)VertexManager::s_pCurBufferPointer)[-3], ((float*)VertexManager::s_pCurBufferPointer)[-2], ((float*)VertexManager::s_pCurBufferPointer)[-1]); #define LOG_NORMF() // PRIM_LOG("norm: %f %f %f, ", ((float*)VertexManager::s_pCurBufferPointer)[-3], ((float*)VertexManager::s_pCurBufferPointer)[-2], ((float*)VertexManager::s_pCurBufferPointer)[-1]);
@ -411,14 +415,30 @@ void LOADERDECL VertexLoader_Normal::Normal_Index16_Byte_Expand16()
LOG_NORM16(); LOG_NORM16();
} }
#if _M_SSE >= 0x301
// Control mask for _mm_shuffle_epi8(): byte-swaps the three lowest 16-bit lanes and
// zeroes every other byte (a mask byte with its top bit set yields 0).
static const __m128i kMaskSwap16_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0FFFF0405L, 0x02030001L);
#endif
// Reads a 16-bit index from the data stream, fetches the three 16-bit normal
// components at that index of the ARRAY_NORMAL array, byte-swaps them and writes them
// plus a zero fourth component (8 bytes in total) to the vertex buffer.
void LOADERDECL VertexLoader_Normal::Normal_Index16_Short() void LOADERDECL VertexLoader_Normal::Normal_Index16_Short()
{ {
u16 Index = DataReadU16(); u16 Index = DataReadU16();
const u16* pData = (const u16 *)(cached_arraybases[ARRAY_NORMAL] + (Index * arraystrides[ARRAY_NORMAL])); const u16* pData = (const u16 *)(cached_arraybases[ARRAY_NORMAL] + (Index * arraystrides[ARRAY_NORMAL]));
#if _M_SSE >= 0x301
// Load 8 bytes, swap the three components and clear the fourth in one shuffle,
// then store the low 8 bytes.
// NOTE(review): this loads 8 bytes from pData while the scalar path reads only 6 —
// confirm the two extra bytes are always readable.
__m128i a = _mm_loadl_epi64((__m128i*)pData);
__m128i b = _mm_shuffle_epi8(a, kMaskSwap16_3);
_mm_storel_epi64((__m128i*)VertexManager::s_pCurBufferPointer, b);
#else
// Scalar fallback: swap each component individually and pad with a zero
((u16*)VertexManager::s_pCurBufferPointer)[0] = Common::swap16(pData[0]); ((u16*)VertexManager::s_pCurBufferPointer)[0] = Common::swap16(pData[0]);
((u16*)VertexManager::s_pCurBufferPointer)[1] = Common::swap16(pData[1]); ((u16*)VertexManager::s_pCurBufferPointer)[1] = Common::swap16(pData[1]);
((u16*)VertexManager::s_pCurBufferPointer)[2] = Common::swap16(pData[2]); ((u16*)VertexManager::s_pCurBufferPointer)[2] = Common::swap16(pData[2]);
((u16*)VertexManager::s_pCurBufferPointer)[3] = 0; ((u16*)VertexManager::s_pCurBufferPointer)[3] = 0;
#endif
VertexManager::s_pCurBufferPointer += 8; VertexManager::s_pCurBufferPointer += 8;
LOG_NORM16(); LOG_NORM16();
} }

View File

@ -21,6 +21,10 @@
#include "VertexLoader_Position.h" #include "VertexLoader_Position.h"
#include "NativeVertexWriter.h" #include "NativeVertexWriter.h"
// <tmmintrin.h> provides the SSSE3 intrinsics (_mm_shuffle_epi8) used below.
// The level must be written in hexadecimal: decimal 301 equals 0x12D and would not
// agree with the "_M_SSE >= 0x301" guards around the code that needs this header.
#if _M_SSE >= 0x301
#include <tmmintrin.h>
#endif
extern float posScale; extern float posScale;
extern TVtxAttr *pVtxAttr; extern TVtxAttr *pVtxAttr;
@ -146,16 +150,33 @@ inline void Pos_ReadIndex_Short(int Index)
VertexManager::s_pCurBufferPointer += 12; VertexManager::s_pCurBufferPointer += 12;
} }
#if _M_SSE >= 0x301
// Control masks for _mm_shuffle_epi8(): reverse the bytes of the three (resp. two)
// lowest 32-bit lanes, like Common::swap32(), and zero the remaining lanes.
static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L);
static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);
#endif
// Fetches the position stored at Index of the ARRAY_POSITION array, byte-swaps its
// 32-bit components and writes three 32-bit values to the vertex buffer. When the
// template argument `three` is false only two components are read and the third
// output value is zero.
template<bool three> template<bool three>
inline void Pos_ReadIndex_Float(int Index) inline void Pos_ReadIndex_Float(int Index)
{ {
const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION])); const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION]));
#if _M_SSE >= 0x301
// Unaligned 16-byte load, one shuffle that swaps the wanted lanes and clears the
// rest, unaligned 16-byte store.
// NOTE(review): this reads and writes 16 bytes although only 12 are consumed
// (the pointer advances by 12 below) — confirm that 4 extra bytes are always
// accessible in both the source array and the vertex buffer.
const __m128i a = _mm_loadu_si128((__m128i*)pData);
__m128i b = _mm_shuffle_epi8(a, three ? kMaskSwap32_3 : kMaskSwap32_2);
_mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, b);
#else
// Scalar fallback: swap each component individually
((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]); ((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]);
((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]); ((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]);
if (three) if (three)
((u32*)VertexManager::s_pCurBufferPointer)[2] = Common::swap32(pData[2]); ((u32*)VertexManager::s_pCurBufferPointer)[2] = Common::swap32(pData[2]);
else else
((float*)VertexManager::s_pCurBufferPointer)[2] = 0.0f; ((float*)VertexManager::s_pCurBufferPointer)[2] = 0.0f;
#endif
LOG_VTX(); LOG_VTX();
VertexManager::s_pCurBufferPointer += 12; VertexManager::s_pCurBufferPointer += 12;
} }

View File

@ -21,6 +21,12 @@
#include "VertexLoader_TextCoord.h" #include "VertexLoader_TextCoord.h"
#include "NativeVertexWriter.h" #include "NativeVertexWriter.h"
#if _M_SSE >= 0x401
#include <smmintrin.h>
#elif _M_SSE >= 0x301
#include <tmmintrin.h>
#endif
#define LOG_TEX1() // PRIM_LOG("tex: %f, ", ((float*)VertexManager::s_pCurBufferPointer)[0]); #define LOG_TEX1() // PRIM_LOG("tex: %f, ", ((float*)VertexManager::s_pCurBufferPointer)[0]);
#define LOG_TEX2() // PRIM_LOG("tex: %f %f, ", ((float*)VertexManager::s_pCurBufferPointer)[0], ((float*)VertexManager::s_pCurBufferPointer)[1]); #define LOG_TEX2() // PRIM_LOG("tex: %f %f, ", ((float*)VertexManager::s_pCurBufferPointer)[0], ((float*)VertexManager::s_pCurBufferPointer)[1]);
@ -284,13 +290,35 @@ void LOADERDECL TexCoord_ReadIndex16_Short1()
VertexManager::s_pCurBufferPointer += 4; VertexManager::s_pCurBufferPointer += 4;
tcIndex++; tcIndex++;
} }
#if _M_SSE >= 0x401
// Control mask for _mm_shuffle_epi8(): byte-swaps the two lowest 16-bit lanes and
// zeroes every other byte (a mask byte with its top bit set yields 0).
static const __m128i kMaskSwap16_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x02030001L);
#endif
void LOADERDECL TexCoord_ReadIndex16_Short2() void LOADERDECL TexCoord_ReadIndex16_Short2()
{ {
// Heavy in ZWW // Heavy in ZWW
u16 Index = DataReadU16(); u16 Index = DataReadU16();
#if _M_SSE >= 0x401
const s32 *pData = (const s32*)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex]));
const __m128i a = _mm_cvtsi32_si128(*pData);
const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16_2);
const __m128i c = _mm_cvtepi16_epi32(b);
const __m128 d = _mm_cvtepi32_ps(c);
const __m128 e = _mm_load1_ps(&tcScale[tcIndex]);
const __m128 f = _mm_mul_ps(d, e);
_mm_storeu_ps((float*)VertexManager::s_pCurBufferPointer, f);
#else
const u16 *pData = (const u16 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); const u16 *pData = (const u16 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex]));
((float*)VertexManager::s_pCurBufferPointer)[0] = (float)(s16)Common::swap16(pData[0]) * tcScale[tcIndex]; ((float*)VertexManager::s_pCurBufferPointer)[0] = (float)(s16)Common::swap16(pData[0]) * tcScale[tcIndex];
((float*)VertexManager::s_pCurBufferPointer)[1] = (float)(s16)Common::swap16(pData[1]) * tcScale[tcIndex]; ((float*)VertexManager::s_pCurBufferPointer)[1] = (float)(s16)Common::swap16(pData[1]) * tcScale[tcIndex];
#endif
LOG_TEX2(); LOG_TEX2();
VertexManager::s_pCurBufferPointer += 8; VertexManager::s_pCurBufferPointer += 8;
tcIndex++; tcIndex++;
@ -305,15 +333,36 @@ void LOADERDECL TexCoord_ReadIndex16_Float1()
VertexManager::s_pCurBufferPointer += 4; VertexManager::s_pCurBufferPointer += 4;
tcIndex++; tcIndex++;
} }
#if _M_SSE >= 0x301
// Control mask for _mm_shuffle_epi8(): reverses the bytes of the two lowest 32-bit
// lanes, like Common::swap32(), and zeroes the upper two lanes.
static const __m128i kMaskSwap32 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);
#endif
// Reads a 16-bit index from the data stream, fetches the two 32-bit texture
// coordinate components at that index of the current texcoord array
// (ARRAY_TEXCOORD0 + tcIndex), byte-swaps them, writes 8 bytes to the vertex buffer
// and moves on to the next texcoord slot.
void LOADERDECL TexCoord_ReadIndex16_Float2() void LOADERDECL TexCoord_ReadIndex16_Float2()
{ {
u16 Index = DataReadU16(); u16 Index = DataReadU16();
const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex]));
#if _M_SSE >= 0x301
// Load both components (8 bytes), swap them with a single shuffle and store the
// low 8 bytes; the buffer pointer is kept in a local while it is advanced.
const __m128i a = _mm_loadl_epi64((__m128i*)pData);
const __m128i b = _mm_shuffle_epi8(a, kMaskSwap32);
u8* p = VertexManager::s_pCurBufferPointer;
_mm_storel_epi64((__m128i*)p, b);
LOG_TEX2();
p += 8;
VertexManager::s_pCurBufferPointer = p;
tcIndex++;
#else
// Scalar fallback: swap each component individually
((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]); ((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]);
((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]); ((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]);
LOG_TEX2(); LOG_TEX2();
VertexManager::s_pCurBufferPointer += 8; VertexManager::s_pCurBufferPointer += 8;
tcIndex++; tcIndex++;
#endif
} }
ReadTexCoord tableReadTexCoord[4][8][2] = { ReadTexCoord tableReadTexCoord[4][8][2] = {