diff --git a/Source/Core/VideoCommon/Src/TextureDecoder.cpp b/Source/Core/VideoCommon/Src/TextureDecoder.cpp index ecd78ba16b..7cc0c05247 100644 --- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp +++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp @@ -16,6 +16,13 @@ // http://code.google.com/p/dolphin-emu/ #include + +#if _M_SSE >= 0x401 +#include +#elif _M_SSE >= 0x301 +#include +#endif + #include "Common.h" //#include "VideoCommon.h" // to get debug logs @@ -359,14 +366,35 @@ inline void decodebytesC8_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr) } } +#if _M_SSE >= 0x301 + static const __m128i kMaskSwap16 = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L); /* shuffle mask: swaps the two bytes of each 16-bit lane */ +#endif + inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr) { u16* tlut = (u16*)(texMem + tlutaddr); + +#if _M_SSE >= 0x301 + + // Gather the eight 16-bit TLUT entries tlut[src[0]] .. tlut[src[7]] into one vector + const __m128i a = _mm_set_epi16(tlut[src[7]], tlut[src[6]], tlut[src[5]], tlut[src[4]], tlut[src[3]], tlut[src[2]], tlut[src[1]], tlut[src[0]]); + + // Byte-swap all eight 16-bit values at once (Common::swap16() on every lane) + const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16); + + // Store the 16 bytes to dst with a non-temporal store (no cache pollution); _mm_stream_si128 needs a 16-byte-aligned dst + _mm_stream_si128((__m128i*)dst, b); + +#else + for (int x = 0; x < 8; x++) { u8 val = src[x]; *dst++ = Common::swap16(tlut[val]); } + +#endif + } @@ -869,7 +897,9 @@ PC_TexFormat TexDecoder_DirectDecode_real(u8 *dst, const u8 *src, int width, int return PC_TEX_FMT_NONE; } - +#if _M_SSE >= 0x301 +static const __m128i kMaskSwap32 = _mm_set_epi32(0x0C0D0E0FL, 0x08090A0BL, 0x04050607L, 0x00010203L); /* shuffle mask: reverses the four bytes of each 32-bit lane */ +#endif //switch endianness, unswizzle //TODO: to save memory, don't blindly convert everything to argb8888 @@ -997,13 +1027,59 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh return PC_TEX_FMT_BGRA32; case GX_TF_RGBA8: // speed critical { + +#if _M_SSE >= 0x301 + + for (int y = 0; y < height; y += 4) { + __m128i* p = (__m128i*)(src + y * width * 
4); + for (int x = 0; x < width; x += 4) { + +#if _M_SSE >= 0x401 + // Load 64 bytes (the input for one 4x4 output tile) at once. + const __m128i a0 = _mm_stream_load_si128(p++); + const __m128i a1 = _mm_stream_load_si128(p++); + const __m128i a2 = _mm_stream_load_si128(p++); + const __m128i a3 = _mm_stream_load_si128(p++); +#else + const __m128i a0 = _mm_load_si128(p++); + const __m128i a1 = _mm_load_si128(p++); + const __m128i a2 = _mm_load_si128(p++); + const __m128i a3 = _mm_load_si128(p++); +#endif + + // Shuffle 16-bit integers by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(), + // apply Common::swap32() by _mm_shuffle_epi8() and + // store them by _mm_stream_si128(). + // See decodebytesARGB8_4() for the idea. + const __m128i b0 = _mm_unpacklo_epi16(a0, a2); + const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32); + _mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0); + + const __m128i b1 = _mm_unpackhi_epi16(a0, a2); + const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32); + _mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1); + + const __m128i b2 = _mm_unpacklo_epi16(a1, a3); + const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32); + _mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2); + + const __m128i b3 = _mm_unpackhi_epi16(a1, a3); + const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32); + _mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3); + } + } + +#else + for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 4) - { + for (int x = 0; x < width; x += 4) + { for (int iy = 0; iy < 4; iy++) - decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16); + decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16); src += 64; - } + } + +#endif } return PC_TEX_FMT_BGRA32; case GX_TF_CMPR: // speed critical diff --git a/Source/Core/VideoCommon/Src/VertexLoader_Normal.cpp b/Source/Core/VideoCommon/Src/VertexLoader_Normal.cpp index bf38ef1e80..4cee83d9f6 
100644 --- a/Source/Core/VideoCommon/Src/VertexLoader_Normal.cpp +++ b/Source/Core/VideoCommon/Src/VertexLoader_Normal.cpp @@ -21,6 +21,10 @@ #include "VertexLoader_Normal.h" #include "NativeVertexWriter.h" +#if _M_SSE >= 0x301 +#include +#endif + #define LOG_NORM8() // PRIM_LOG("norm: %f %f %f, ", ((s8*)VertexManager::s_pCurBufferPointer)[-3]/127.0f, ((s8*)VertexManager::s_pCurBufferPointer)[-2]/127.0f, ((s8*)VertexManager::s_pCurBufferPointer)[-1]/127.0f); #define LOG_NORM16() // PRIM_LOG("norm: %f %f %f, ", ((s16*)VertexManager::s_pCurBufferPointer)[-3]/32767.0f, ((s16*)VertexManager::s_pCurBufferPointer)[-2]/32767.0f, ((s16*)VertexManager::s_pCurBufferPointer)[-1]/32767.0f); #define LOG_NORMF() // PRIM_LOG("norm: %f %f %f, ", ((float*)VertexManager::s_pCurBufferPointer)[-3], ((float*)VertexManager::s_pCurBufferPointer)[-2], ((float*)VertexManager::s_pCurBufferPointer)[-1]); @@ -411,14 +415,30 @@ void LOADERDECL VertexLoader_Normal::Normal_Index16_Byte_Expand16() LOG_NORM16(); } +#if _M_SSE >= 0x301 +static const __m128i kMaskSwap16_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0FFFF0405L, 0x02030001L); /* shuffle mask: byte-swaps 16-bit lanes 0-2 and zeroes bytes 6-15 */ +#endif + void LOADERDECL VertexLoader_Normal::Normal_Index16_Short() { u16 Index = DataReadU16(); const u16* pData = (const u16 *)(cached_arraybases[ARRAY_NORMAL] + (Index * arraystrides[ARRAY_NORMAL])); + +#if _M_SSE >= 0x301 + + __m128i a = _mm_loadl_epi64((__m128i*)pData); /* loads 8 bytes, i.e. pData[0..3]; the scalar path reads only pData[0..2] */ + __m128i b = _mm_shuffle_epi8(a, kMaskSwap16_3); /* swap16 on the three components, fourth lane becomes 0 as in the scalar path */ + _mm_storel_epi64((__m128i*)VertexManager::s_pCurBufferPointer, b); /* writes the low 8 bytes, matching the 8-byte advance below */ + +#else + ((u16*)VertexManager::s_pCurBufferPointer)[0] = Common::swap16(pData[0]); ((u16*)VertexManager::s_pCurBufferPointer)[1] = Common::swap16(pData[1]); ((u16*)VertexManager::s_pCurBufferPointer)[2] = Common::swap16(pData[2]); ((u16*)VertexManager::s_pCurBufferPointer)[3] = 0; + +#endif + VertexManager::s_pCurBufferPointer += 8; LOG_NORM16(); } diff --git a/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp b/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp 
index aa193d4955..6ff0884cec 100644 --- a/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp +++ b/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp @@ -21,6 +21,10 @@ #include "VertexLoader_Position.h" #include "NativeVertexWriter.h" +#if _M_SSE >= 0x301 /* hex level, same threshold as the guards around the intrinsics below */ +#include +#endif + extern float posScale; extern TVtxAttr *pVtxAttr; @@ -146,16 +150,33 @@ inline void Pos_ReadIndex_Short(int Index) VertexManager::s_pCurBufferPointer += 12; } +#if _M_SSE >= 0x301 +static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L); /* shuffle mask: byte-swaps 32-bit lanes 0-2 and zeroes lane 3 */ +static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L); /* shuffle mask: byte-swaps 32-bit lanes 0-1 and zeroes lanes 2-3 (all-zero bits equal 0.0f) */ +#endif + template inline void Pos_ReadIndex_Float(int Index) { const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION])); + +#if _M_SSE >= 0x301 + + const __m128i a = _mm_loadu_si128((__m128i*)pData); /* NOTE(review): reads 16 bytes from pData; the scalar path reads only 8 or 12 */ + __m128i b = _mm_shuffle_epi8(a, three ? kMaskSwap32_3 : kMaskSwap32_2); /* swap32 on the used components, remaining lanes become 0 */ + _mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, b); /* NOTE(review): writes 16 bytes while the pointer advances by 12 below; the extra 4 bytes are zero */ + +#else + ((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]); ((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]); if (three) ((u32*)VertexManager::s_pCurBufferPointer)[2] = Common::swap32(pData[2]); else ((float*)VertexManager::s_pCurBufferPointer)[2] = 0.0f; + +#endif + LOG_VTX(); VertexManager::s_pCurBufferPointer += 12; } diff --git a/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp b/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp index e6720e81a0..ae4e6d0bb8 100644 --- a/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp +++ b/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp @@ -21,6 +21,12 @@ #include "VertexLoader_TextCoord.h" #include "NativeVertexWriter.h" +#if _M_SSE >= 0x401 +#include +#elif _M_SSE >= 0x301 +#include +#endif + #define LOG_TEX1() // PRIM_LOG("tex: %f, ", ((float*)VertexManager::s_pCurBufferPointer)[0]); #define LOG_TEX2() // 
PRIM_LOG("tex: %f %f, ", ((float*)VertexManager::s_pCurBufferPointer)[0], ((float*)VertexManager::s_pCurBufferPointer)[1]); @@ -284,13 +290,35 @@ void LOADERDECL TexCoord_ReadIndex16_Short1() VertexManager::s_pCurBufferPointer += 4; tcIndex++; } + +#if _M_SSE >= 0x401 +static const __m128i kMaskSwap16_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x02030001L); /* shuffle mask: byte-swaps 16-bit lanes 0-1 and zeroes bytes 4-15 */ +#endif + void LOADERDECL TexCoord_ReadIndex16_Short2() { // Heavy in ZWW u16 Index = DataReadU16(); + +#if _M_SSE >= 0x401 + + const s32 *pData = (const s32*)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); + const __m128i a = _mm_cvtsi32_si128(*pData); /* 4 bytes = both 16-bit coordinates, upper lanes zero */ + const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16_2); /* swap16 on both coordinates */ + const __m128i c = _mm_cvtepi16_epi32(b); /* sign-extend 16 to 32 bits, the (s16) cast of the scalar path */ + const __m128 d = _mm_cvtepi32_ps(c); /* integer to float */ + const __m128 e = _mm_load1_ps(&tcScale[tcIndex]); /* broadcast the scale to all four lanes */ + const __m128 f = _mm_mul_ps(d, e); + _mm_storeu_ps((float*)VertexManager::s_pCurBufferPointer, f); /* NOTE(review): stores 4 floats (16 bytes) although the pointer advances by 8 below */ + +#else + const u16 *pData = (const u16 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); ((float*)VertexManager::s_pCurBufferPointer)[0] = (float)(s16)Common::swap16(pData[0]) * tcScale[tcIndex]; ((float*)VertexManager::s_pCurBufferPointer)[1] = (float)(s16)Common::swap16(pData[1]) * tcScale[tcIndex]; + +#endif + LOG_TEX2(); VertexManager::s_pCurBufferPointer += 8; tcIndex++; @@ -305,15 +333,36 @@ void LOADERDECL TexCoord_ReadIndex16_Float1() VertexManager::s_pCurBufferPointer += 4; tcIndex++; } + +#if _M_SSE >= 0x301 +static const __m128i kMaskSwap32 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L); /* shuffle mask: byte-swaps 32-bit lanes 0-1 and zeroes bytes 8-15 */ +#endif + void LOADERDECL TexCoord_ReadIndex16_Float2() { u16 Index = DataReadU16(); const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); + +#if _M_SSE >= 0x301 + + const __m128i a = _mm_loadl_epi64((__m128i*)pData); /* loads 8 bytes = pData[0..1] */ + const __m128i b = _mm_shuffle_epi8(a, kMaskSwap32); /* swap32 on both values */ + u8* p = 
VertexManager::s_pCurBufferPointer; + _mm_storel_epi64((__m128i*)p, b); /* writes exactly 8 bytes, same as the two scalar stores */ + LOG_TEX2(); + p += 8; + VertexManager::s_pCurBufferPointer = p; + tcIndex++; + +#else + ((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]); ((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]); LOG_TEX2(); VertexManager::s_pCurBufferPointer += 8; tcIndex++; + +#endif } ReadTexCoord tableReadTexCoord[4][8][2] = {