mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-01-06 02:01:59 +00:00
VideoCommon: Added automatic selection routines for the SSSE3/SSE4.1 code paths. An SSSE3/SSE4.1 code path is selected only if the corresponding preprocessor definition is set and the target CPU supports SSSE3/SSE4.1. The selection routines in VertexLoader_* use function pointers. TextureDecoder uses a combination of "#if" and "if" statements.
git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5302 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
91c6f5acba
commit
956b8eb54d
@ -366,37 +366,41 @@ inline void decodebytesC8_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr)
|
||||
}
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
static const __m128i kMaskSwap16 = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L);
|
||||
#endif
|
||||
|
||||
inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr)
|
||||
{
|
||||
u16* tlut = (u16*)(texMem + tlutaddr);
|
||||
for (int x = 0; x < 8; x++)
|
||||
{
|
||||
u8 val = src[x];
|
||||
*dst++ = Common::swap16(tlut[val]);
|
||||
}
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
static const __m128i kMaskSwap16 = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L);
|
||||
|
||||
// SSSE3 variant of decodebytesC8_To_Raw16(): decodes eight 8-bit palette
// indices into eight 16-bit values using one shuffle and one streaming store.
//
// dst      - receives 8 u16 values (16 bytes) in a single store.
// src      - 8 palette indices, one byte each.
// tlutaddr - byte offset of the lookup table (TLUT) inside texMem.
//
// Only compiled when _M_SSE >= 0x301. It must only be called on CPUs that
// support SSSE3; the decoder checks cpu_info.bSSSE3 before using it.
// NOTE(review): _mm_stream_si128 requires a 16-byte aligned address, so dst is
// presumably 16-byte aligned at every call site - confirm against the caller.
inline void decodebytesC8_To_Raw16_SSSE3(u16* dst, const u8* src, int tlutaddr)
{
	u16* tlut = (u16*)(texMem + tlutaddr);

	// Make 8 16-bits unsigned integer values
	__m128i a = _mm_setzero_si128();
	a = _mm_insert_epi16(a, tlut[src[0]], 0);
	a = _mm_insert_epi16(a, tlut[src[1]], 1);
	a = _mm_insert_epi16(a, tlut[src[2]], 2);
	a = _mm_insert_epi16(a, tlut[src[3]], 3);
	a = _mm_insert_epi16(a, tlut[src[4]], 4);
	a = _mm_insert_epi16(a, tlut[src[5]], 5);
	a = _mm_insert_epi16(a, tlut[src[6]], 6);
	a = _mm_insert_epi16(a, tlut[src[7]], 7);

	// Apply Common::swap16() to 16-bits unsigned integers at once
	const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16);

	// Store values to dst without polluting the caches
	_mm_stream_si128((__m128i*)dst, b);
}
|
||||
|
||||
#endif
|
||||
|
||||
inline void decodebytesC14X2_5A3_To_BGRA32(u32 *dst, const u16 *src, int tlutaddr)
|
||||
{
|
||||
@ -958,10 +962,26 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int y = 0; y < height; y += 4)
|
||||
for (int x = 0; x < width; x += 8)
|
||||
for (int iy = 0; iy < 4; iy++, src += 8)
|
||||
decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr);
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
|
||||
if (cpu_info.bSSSE3) {
|
||||
for (int y = 0; y < height; y += 4)
|
||||
for (int x = 0; x < width; x += 8)
|
||||
for (int iy = 0; iy < 4; iy++, src += 8)
|
||||
decodebytesC8_To_Raw16_SSSE3((u16*)dst + (y + iy) * width + x, src, tlutaddr);
|
||||
break;
|
||||
} else
|
||||
|
||||
#endif
|
||||
|
||||
{
|
||||
for (int y = 0; y < height; y += 4)
|
||||
for (int x = 0; x < width; x += 8)
|
||||
for (int iy = 0; iy < 4; iy++, src += 8)
|
||||
decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr);
|
||||
|
||||
}
|
||||
}
|
||||
return GetPCFormatFromTLUTFormat(tlutfmt);
|
||||
case GX_TF_IA4:
|
||||
@ -1028,59 +1048,93 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
|
||||
case GX_TF_RGBA8: // speed critical
|
||||
{
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
|
||||
for (int y = 0; y < height; y += 4) {
|
||||
__m128i* p = (__m128i*)(src + y * width * 4);
|
||||
for (int x = 0; x < width; x += 4) {
|
||||
// FIXME(nodchip): the following code is too complicated.
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
// Load 64-bytes at once.
|
||||
const __m128i a0 = _mm_stream_load_si128(p++);
|
||||
const __m128i a1 = _mm_stream_load_si128(p++);
|
||||
const __m128i a2 = _mm_stream_load_si128(p++);
|
||||
const __m128i a3 = _mm_stream_load_si128(p++);
|
||||
#else
|
||||
const __m128i a0 = _mm_load_si128(p++);
|
||||
const __m128i a1 = _mm_load_si128(p++);
|
||||
const __m128i a2 = _mm_load_si128(p++);
|
||||
const __m128i a3 = _mm_load_si128(p++);
|
||||
|
||||
if (cpu_info.bSSE4_1) {
|
||||
for (int y = 0; y < height; y += 4) {
|
||||
__m128i* p = (__m128i*)(src + y * width * 4);
|
||||
for (int x = 0; x < width; x += 4) {
|
||||
|
||||
// Load 64-bytes at once.
|
||||
const __m128i a0 = _mm_stream_load_si128(p++);
|
||||
const __m128i a1 = _mm_stream_load_si128(p++);
|
||||
const __m128i a2 = _mm_stream_load_si128(p++);
|
||||
const __m128i a3 = _mm_stream_load_si128(p++);
|
||||
|
||||
// Shuffle 16-bit integeres by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(),
|
||||
// apply Common::swap32() by _mm_shuffle_epi8() and
|
||||
// store them by _mm_stream_si128().
|
||||
// See decodebytesARGB8_4() about the idea.
|
||||
const __m128i b0 = _mm_unpacklo_epi16(a0, a2);
|
||||
const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32);
|
||||
_mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0);
|
||||
|
||||
const __m128i b1 = _mm_unpackhi_epi16(a0, a2);
|
||||
const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32);
|
||||
_mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1);
|
||||
|
||||
const __m128i b2 = _mm_unpacklo_epi16(a1, a3);
|
||||
const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32);
|
||||
_mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2);
|
||||
|
||||
const __m128i b3 = _mm_unpackhi_epi16(a1, a3);
|
||||
const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32);
|
||||
_mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3);
|
||||
}
|
||||
}
|
||||
} else
|
||||
|
||||
#endif
|
||||
|
||||
// Shuffle 16-bit integeres by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(),
|
||||
// apply Common::swap32() by _mm_shuffle_epi8() and
|
||||
// store them by _mm_stream_si128().
|
||||
// See decodebytesARGB8_4() about the idea.
|
||||
const __m128i b0 = _mm_unpacklo_epi16(a0, a2);
|
||||
const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32);
|
||||
_mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0);
|
||||
#if _M_SSE >= 0x301
|
||||
|
||||
const __m128i b1 = _mm_unpackhi_epi16(a0, a2);
|
||||
const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32);
|
||||
_mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1);
|
||||
if (cpu_info.bSSSE3) {
|
||||
for (int y = 0; y < height; y += 4) {
|
||||
__m128i* p = (__m128i*)(src + y * width * 4);
|
||||
for (int x = 0; x < width; x += 4) {
|
||||
|
||||
const __m128i b2 = _mm_unpacklo_epi16(a1, a3);
|
||||
const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32);
|
||||
_mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2);
|
||||
const __m128i a0 = _mm_load_si128(p++);
|
||||
const __m128i a1 = _mm_load_si128(p++);
|
||||
const __m128i a2 = _mm_load_si128(p++);
|
||||
const __m128i a3 = _mm_load_si128(p++);
|
||||
|
||||
const __m128i b3 = _mm_unpackhi_epi16(a1, a3);
|
||||
const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32);
|
||||
_mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3);
|
||||
// Shuffle 16-bit integeres by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(),
|
||||
// apply Common::swap32() by _mm_shuffle_epi8() and
|
||||
// store them by _mm_stream_si128().
|
||||
// See decodebytesARGB8_4() about the idea.
|
||||
const __m128i b0 = _mm_unpacklo_epi16(a0, a2);
|
||||
const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32);
|
||||
_mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0);
|
||||
|
||||
const __m128i b1 = _mm_unpackhi_epi16(a0, a2);
|
||||
const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32);
|
||||
_mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1);
|
||||
|
||||
const __m128i b2 = _mm_unpacklo_epi16(a1, a3);
|
||||
const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32);
|
||||
_mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2);
|
||||
|
||||
const __m128i b3 = _mm_unpackhi_epi16(a1, a3);
|
||||
const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32);
|
||||
_mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3);
|
||||
}
|
||||
}
|
||||
} else
|
||||
|
||||
#endif
|
||||
|
||||
{
|
||||
for (int y = 0; y < height; y += 4)
|
||||
for (int x = 0; x < width; x += 4)
|
||||
{
|
||||
for (int iy = 0; iy < 4; iy++)
|
||||
decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16);
|
||||
src += 64;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for (int y = 0; y < height; y += 4)
|
||||
for (int x = 0; x < width; x += 4)
|
||||
{
|
||||
for (int iy = 0; iy < 4; iy++)
|
||||
decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16);
|
||||
src += 64;
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
}
|
||||
return PC_TEX_FMT_BGRA32;
|
||||
case GX_TF_CMPR: // speed critical
|
||||
// The metroid games use this format almost exclusively.
|
||||
|
@ -175,6 +175,8 @@ VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr)
|
||||
m_NativeFmt = NativeVertexFormat::Create();
|
||||
loop_counter = 0;
|
||||
VertexLoader_Normal::Init();
|
||||
VertexLoader_Position::Init();
|
||||
VertexLoader_TextCoord::Init();
|
||||
|
||||
m_VtxDesc = vtx_desc;
|
||||
SetVAT(vtx_attr.g0.Hex, vtx_attr.g1.Hex, vtx_attr.g2.Hex);
|
||||
@ -268,8 +270,8 @@ void VertexLoader::CompileVertexTranslator()
|
||||
_assert_msg_(VIDEO, FORMAT_UBYTE <= m_VtxAttr.PosFormat && m_VtxAttr.PosFormat <= FORMAT_FLOAT, "Invalid vertex position format!\n(m_VtxAttr.PosFormat = %d)", m_VtxAttr.PosFormat);
|
||||
_assert_msg_(VIDEO, 0 <= m_VtxAttr.PosElements && m_VtxAttr.PosElements <= 1, "Invalid number of vertex position elemnts!\n(m_VtxAttr.PosElements = %d)", m_VtxAttr.PosElements);
|
||||
|
||||
WriteCall(tableReadPosition[m_VtxDesc.Position][m_VtxAttr.PosFormat][m_VtxAttr.PosElements]);
|
||||
m_VertexSize += tableReadPositionVertexSize[m_VtxDesc.Position][m_VtxAttr.PosFormat][m_VtxAttr.PosElements];
|
||||
WriteCall(VertexLoader_Position::GetFunction(m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements));
|
||||
m_VertexSize += VertexLoader_Position::GetSize(m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements);
|
||||
nat_offset += 12;
|
||||
|
||||
// OK, so we just got a point. Let's go back and read it for the bounding box.
|
||||
@ -423,8 +425,8 @@ void VertexLoader::CompileVertexTranslator()
|
||||
_assert_msg_(VIDEO, 0 <= elements && elements <= 1, "Invalid number of texture coordinates elemnts!\n(elements = %d)", elements);
|
||||
|
||||
m_NativeFmt->m_components |= VB_HAS_UV0 << i;
|
||||
WriteCall(tableReadTexCoord[tc[i]][format][elements]);
|
||||
m_VertexSize += tableReadTexCoordVertexSize[tc[i]][format][elements];
|
||||
WriteCall(VertexLoader_TextCoord::GetFunction(tc[i], format, elements));
|
||||
m_VertexSize += VertexLoader_TextCoord::GetSize(tc[i], format, elements);
|
||||
}
|
||||
|
||||
if (m_NativeFmt->m_components & (VB_HAS_TEXMTXIDX0 << i)) {
|
||||
@ -459,7 +461,7 @@ void VertexLoader::CompileVertexTranslator()
|
||||
int j = i + 1;
|
||||
for (; j < 8; ++j) {
|
||||
if (tc[j] != NOT_PRESENT) {
|
||||
WriteCall(TexCoord_Read_Dummy); // important to get indices right!
|
||||
WriteCall(VertexLoader_TextCoord::GetDummyFunction()); // important to get indices right!
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include "VertexLoader.h"
|
||||
#include "VertexLoader_Normal.h"
|
||||
#include "NativeVertexWriter.h"
|
||||
#include "CPUDetect.h"
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
#include <tmmintrin.h>
|
||||
@ -114,6 +115,18 @@ void VertexLoader_Normal::Init(void)
|
||||
m_TableExpand16[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE] = Set(2, Normal_Index16_Byte3_Indices1_Expand16);
|
||||
m_TableExpand16[NRM_INDEX16][NRM_INDICES3][NRM_NBT] [FORMAT_BYTE] = Set(2, Normal_Index16_Byte_Expand16);
|
||||
m_TableExpand16[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE] = Set(6, Normal_Index16_Byte3_Indices3_Expand16);
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
|
||||
if (cpu_info.bSSSE3) {
|
||||
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT] [FORMAT_USHORT] = Set(2, Normal_Index16_Short_SSSE3); //HACK
|
||||
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT] [FORMAT_SHORT] = Set(2, Normal_Index16_Short_SSSE3);
|
||||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT] [FORMAT_USHORT] = Set(2, Normal_Index16_Short_SSSE3); //HACK
|
||||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT] [FORMAT_SHORT] = Set(2, Normal_Index16_Short_SSSE3);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
unsigned int VertexLoader_Normal::GetSize(unsigned int _type, unsigned int _format, unsigned int _elements, unsigned int _index3)
|
||||
@ -415,34 +428,33 @@ void LOADERDECL VertexLoader_Normal::Normal_Index16_Byte_Expand16()
|
||||
LOG_NORM16();
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
static const __m128i kMaskSwap16_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0FFFF0405L, 0x02030001L);
|
||||
#endif
|
||||
|
||||
void LOADERDECL VertexLoader_Normal::Normal_Index16_Short()
|
||||
{
|
||||
u16 Index = DataReadU16();
|
||||
const u16* pData = (const u16 *)(cached_arraybases[ARRAY_NORMAL] + (Index * arraystrides[ARRAY_NORMAL]));
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
|
||||
__m128i a = _mm_loadl_epi64((__m128i*)pData);
|
||||
__m128i b = _mm_shuffle_epi8(a, kMaskSwap16_3);
|
||||
_mm_storel_epi64((__m128i*)VertexManager::s_pCurBufferPointer, b);
|
||||
|
||||
#else
|
||||
|
||||
((u16*)VertexManager::s_pCurBufferPointer)[0] = Common::swap16(pData[0]);
|
||||
((u16*)VertexManager::s_pCurBufferPointer)[1] = Common::swap16(pData[1]);
|
||||
((u16*)VertexManager::s_pCurBufferPointer)[2] = Common::swap16(pData[2]);
|
||||
((u16*)VertexManager::s_pCurBufferPointer)[3] = 0;
|
||||
|
||||
#endif
|
||||
|
||||
VertexManager::s_pCurBufferPointer += 8;
|
||||
LOG_NORM16();
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
static const __m128i kMaskSwap16_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0FFFF0405L, 0x02030001L);
|
||||
|
||||
// SSSE3 loader for a 16-bit-indexed normal made of three 16-bit components.
//
// Reads a 16-bit index from the vertex stream, loads 8 bytes from the
// ARRAY_NORMAL entry, and lets one byte shuffle do the work: kMaskSwap16_3
// swaps the two bytes of each of the first three u16 values and, through its
// 0xFF selector bytes, zeroes the fourth. The low 8 bytes of the result are
// stored and the output pointer advances by 8.
//
// Note that the load fetches 8 bytes although only 6 of them are kept.
void LOADERDECL VertexLoader_Normal::Normal_Index16_Short_SSSE3()
{
	const u16 Index = DataReadU16();
	const u16* pData = (const u16*)(cached_arraybases[ARRAY_NORMAL] + (Index * arraystrides[ARRAY_NORMAL]));

	const __m128i raw = _mm_loadl_epi64((__m128i*)pData);
	_mm_storel_epi64((__m128i*)VertexManager::s_pCurBufferPointer, _mm_shuffle_epi8(raw, kMaskSwap16_3));

	VertexManager::s_pCurBufferPointer += 8;
	LOG_NORM16();
}
|
||||
#endif
|
||||
|
||||
void LOADERDECL VertexLoader_Normal::Normal_Index16_Float()
|
||||
{
|
||||
u16 Index = DataReadU16();
|
||||
|
@ -117,6 +117,11 @@ private:
|
||||
static void LOADERDECL Normal_Index16_Byte3_Indices3_Expand16();
|
||||
static void LOADERDECL Normal_Index16_Short3_Indices3();
|
||||
static void LOADERDECL Normal_Index16_Float3_Indices3();
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
static void LOADERDECL Normal_Index16_Short_SSSE3();
|
||||
#endif
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -20,8 +20,9 @@
|
||||
#include "VertexLoader.h"
|
||||
#include "VertexLoader_Position.h"
|
||||
#include "NativeVertexWriter.h"
|
||||
#include "CPUDetect.h"
|
||||
|
||||
#if _M_SSE >= 301
|
||||
#if _M_SSE >= 0x301
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
@ -150,37 +151,36 @@ inline void Pos_ReadIndex_Short(int Index)
|
||||
VertexManager::s_pCurBufferPointer += 12;
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L);
|
||||
static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);
|
||||
#endif
|
||||
|
||||
template<bool three>
|
||||
inline void Pos_ReadIndex_Float(int Index)
|
||||
{
|
||||
const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION]));
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
|
||||
const __m128i a = _mm_loadu_si128((__m128i*)pData);
|
||||
__m128i b = _mm_shuffle_epi8(a, three ? kMaskSwap32_3 : kMaskSwap32_2);
|
||||
_mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, b);
|
||||
|
||||
#else
|
||||
|
||||
((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]);
|
||||
((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]);
|
||||
if (three)
|
||||
((u32*)VertexManager::s_pCurBufferPointer)[2] = Common::swap32(pData[2]);
|
||||
else
|
||||
((float*)VertexManager::s_pCurBufferPointer)[2] = 0.0f;
|
||||
|
||||
#endif
|
||||
|
||||
LOG_VTX();
|
||||
VertexManager::s_pCurBufferPointer += 12;
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L);
|
||||
static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);
|
||||
|
||||
template<bool three>
|
||||
inline void Pos_ReadIndex_Float_SSSE3(int Index)
|
||||
{
|
||||
const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION]));
|
||||
const __m128i a = _mm_loadu_si128((__m128i*)pData);
|
||||
__m128i b = _mm_shuffle_epi8(a, three ? kMaskSwap32_3 : kMaskSwap32_2);
|
||||
_mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, b);
|
||||
LOG_VTX();
|
||||
VertexManager::s_pCurBufferPointer += 12;
|
||||
}
|
||||
#endif
|
||||
|
||||
// ==============================================================================
|
||||
// Index 8
|
||||
// ==============================================================================
|
||||
@ -209,7 +209,14 @@ void LOADERDECL Pos_ReadIndex16_UShort2() {Pos_ReadIndex_Short<u16, false>(DataR
|
||||
void LOADERDECL Pos_ReadIndex16_Short2() {Pos_ReadIndex_Short<s16, false>(DataReadU16());}
|
||||
void LOADERDECL Pos_ReadIndex16_Float2() {Pos_ReadIndex_Float<false> (DataReadU16());}
|
||||
|
||||
ReadPosision tableReadPosition[4][8][2] = {
|
||||
#if _M_SSE >= 0x301
// Table entry points for the SSSE3 float position loader. Each one reads the
// element index from the vertex stream (8-bit or 16-bit) and forwards it to
// Pos_ReadIndex_Float_SSSE3<three>, where <true> handles three components and
// <false> handles two.
void LOADERDECL Pos_ReadIndex8_Float3_SSSE3()
{
	Pos_ReadIndex_Float_SSSE3<true>(DataReadU8());
}

void LOADERDECL Pos_ReadIndex8_Float2_SSSE3()
{
	Pos_ReadIndex_Float_SSSE3<false>(DataReadU8());
}

void LOADERDECL Pos_ReadIndex16_Float3_SSSE3()
{
	Pos_ReadIndex_Float_SSSE3<true>(DataReadU16());
}

void LOADERDECL Pos_ReadIndex16_Float2_SSSE3()
{
	Pos_ReadIndex_Float_SSSE3<false>(DataReadU16());
}
#endif
|
||||
|
||||
static TPipelineFunction tableReadPosition[4][8][2] = {
|
||||
{
|
||||
{NULL, NULL,},
|
||||
{NULL, NULL,},
|
||||
@ -240,7 +247,7 @@ ReadPosision tableReadPosition[4][8][2] = {
|
||||
},
|
||||
};
|
||||
|
||||
int tableReadPositionVertexSize[4][8][2] = {
|
||||
static int tableReadPositionVertexSize[4][8][2] = {
|
||||
{
|
||||
{0, 0,},
|
||||
{0, 0,},
|
||||
@ -271,3 +278,26 @@ int tableReadPositionVertexSize[4][8][2] = {
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
// Patches the position loader table for the running CPU.
//
// When the build enables SSSE3 (_M_SSE >= 0x301) and cpu_info reports SSSE3
// support, the four indexed-float entries are replaced by their SSSE3
// implementations. Otherwise the table is left as statically initialised.
void VertexLoader_Position::Init(void)
{
#if _M_SSE >= 0x301
	if (!cpu_info.bSSSE3)
		return;

	// [2] = 8-bit index, [3] = 16-bit index; [4] = float format;
	// [0] = two elements, [1] = three elements (per the assigned loaders).
	tableReadPosition[2][4][0] = Pos_ReadIndex8_Float2_SSSE3;
	tableReadPosition[2][4][1] = Pos_ReadIndex8_Float3_SSSE3;
	tableReadPosition[3][4][0] = Pos_ReadIndex16_Float2_SSSE3;
	tableReadPosition[3][4][1] = Pos_ReadIndex16_Float3_SSSE3;
#endif
}
|
||||
|
||||
unsigned int VertexLoader_Position::GetSize(unsigned int _type, unsigned int _format, unsigned int _elements) {
|
||||
return tableReadPositionVertexSize[_type][_format][_elements];
|
||||
}
|
||||
|
||||
// Looks up the loader function in tableReadPosition for the given position
// descriptor type, format and element-count selector. The arguments are used
// directly as table indices and are not range-checked; the table contains
// NULL entries.
TPipelineFunction VertexLoader_Position::GetFunction(unsigned int _type, unsigned int _format, unsigned int _elements)
{
	const TPipelineFunction loader = tableReadPosition[_type][_format][_elements];
	return loader;
}
|
||||
|
@ -18,17 +18,17 @@
|
||||
#ifndef VERTEXLOADER_POSITION_H
|
||||
#define VERTEXLOADER_POSITION_H
|
||||
|
||||
typedef void (LOADERDECL *ReadPosision)();
|
||||
class VertexLoader_Position {
|
||||
public:
|
||||
|
||||
// Hold function pointers of vertex loaders.
|
||||
// The first dimension corresponds to TVtxDesc.Position.
|
||||
// The second dimension corresponds to TVtxAttr.PosFormat.
|
||||
// The third dimension corresponds to TVtxAttr.PosElements.
|
||||
// The dimensions are aligned to 2^n for speed up.
|
||||
extern ReadPosision tableReadPosition[4][8][2];
|
||||
// Init
|
||||
static void Init(void);
|
||||
|
||||
// Hold vertex size of each vertex format.
|
||||
// The dimensions are same as tableReadPosition.
|
||||
extern int tableReadPositionVertexSize[4][8][2];
|
||||
// GetSize
|
||||
static unsigned int GetSize(unsigned int _type, unsigned int _format, unsigned int _elements);
|
||||
|
||||
// GetFunction
|
||||
static TPipelineFunction GetFunction(unsigned int _type, unsigned int _format, unsigned int _elements);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include "VertexLoader.h"
|
||||
#include "VertexLoader_TextCoord.h"
|
||||
#include "NativeVertexWriter.h"
|
||||
#include "CPUDetect.h"
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
#include <smmintrin.h>
|
||||
@ -291,17 +292,25 @@ void LOADERDECL TexCoord_ReadIndex16_Short1()
|
||||
tcIndex++;
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
static const __m128i kMaskSwap16_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x02030001L);
|
||||
#endif
|
||||
|
||||
void LOADERDECL TexCoord_ReadIndex16_Short2()
|
||||
{
|
||||
// Heavy in ZWW
|
||||
u16 Index = DataReadU16();
|
||||
u16 Index = DataReadU16();
|
||||
const u16 *pData = (const u16 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex]));
|
||||
((float*)VertexManager::s_pCurBufferPointer)[0] = (float)(s16)Common::swap16(pData[0]) * tcScale[tcIndex];
|
||||
((float*)VertexManager::s_pCurBufferPointer)[1] = (float)(s16)Common::swap16(pData[1]) * tcScale[tcIndex];
|
||||
LOG_TEX2();
|
||||
VertexManager::s_pCurBufferPointer += 8;
|
||||
tcIndex++;
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
static const __m128i kMaskSwap16_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x02030001L);
|
||||
|
||||
void LOADERDECL TexCoord_ReadIndex16_Short2_SSE4()
|
||||
{
|
||||
// Heavy in ZWW
|
||||
u16 Index = DataReadU16();
|
||||
const s32 *pData = (const s32*)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex]));
|
||||
const __m128i a = _mm_cvtsi32_si128(*pData);
|
||||
const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16_2);
|
||||
@ -310,19 +319,11 @@ void LOADERDECL TexCoord_ReadIndex16_Short2()
|
||||
const __m128 e = _mm_load1_ps(&tcScale[tcIndex]);
|
||||
const __m128 f = _mm_mul_ps(d, e);
|
||||
_mm_storeu_ps((float*)VertexManager::s_pCurBufferPointer, f);
|
||||
|
||||
#else
|
||||
|
||||
const u16 *pData = (const u16 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex]));
|
||||
((float*)VertexManager::s_pCurBufferPointer)[0] = (float)(s16)Common::swap16(pData[0]) * tcScale[tcIndex];
|
||||
((float*)VertexManager::s_pCurBufferPointer)[1] = (float)(s16)Common::swap16(pData[1]) * tcScale[tcIndex];
|
||||
|
||||
#endif
|
||||
|
||||
LOG_TEX2();
|
||||
VertexManager::s_pCurBufferPointer += 8;
|
||||
tcIndex++;
|
||||
}
|
||||
#endif
|
||||
|
||||
void LOADERDECL TexCoord_ReadIndex16_Float1()
|
||||
{
|
||||
@ -334,17 +335,24 @@ void LOADERDECL TexCoord_ReadIndex16_Float1()
|
||||
tcIndex++;
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
static const __m128i kMaskSwap32 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);
|
||||
#endif
|
||||
|
||||
void LOADERDECL TexCoord_ReadIndex16_Float2()
|
||||
void LOADERDECL TexCoord_ReadIndex16_Float2()
|
||||
{
|
||||
u16 Index = DataReadU16();
|
||||
const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex]));
|
||||
((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]);
|
||||
((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]);
|
||||
LOG_TEX2();
|
||||
VertexManager::s_pCurBufferPointer += 8;
|
||||
tcIndex++;
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
static const __m128i kMaskSwap32 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);
|
||||
|
||||
void LOADERDECL TexCoord_ReadIndex16_Float2_SSSE3()
|
||||
{
|
||||
u16 Index = DataReadU16();
|
||||
const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex]));
|
||||
const __m128i a = _mm_loadl_epi64((__m128i*)pData);
|
||||
const __m128i b = _mm_shuffle_epi8(a, kMaskSwap32);
|
||||
u8* p = VertexManager::s_pCurBufferPointer;
|
||||
@ -353,19 +361,10 @@ void LOADERDECL TexCoord_ReadIndex16_Float2()
|
||||
p += 8;
|
||||
VertexManager::s_pCurBufferPointer = p;
|
||||
tcIndex++;
|
||||
|
||||
#else
|
||||
|
||||
((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]);
|
||||
((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]);
|
||||
LOG_TEX2();
|
||||
VertexManager::s_pCurBufferPointer += 8;
|
||||
tcIndex++;
|
||||
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
ReadTexCoord tableReadTexCoord[4][8][2] = {
|
||||
static TPipelineFunction tableReadTexCoord[4][8][2] = {
|
||||
{
|
||||
{NULL, NULL,},
|
||||
{NULL, NULL,},
|
||||
@ -396,7 +395,7 @@ ReadTexCoord tableReadTexCoord[4][8][2] = {
|
||||
},
|
||||
};
|
||||
|
||||
int tableReadTexCoordVertexSize[4][8][2] = {
|
||||
static int tableReadTexCoordVertexSize[4][8][2] = {
|
||||
{
|
||||
{0, 0,},
|
||||
{0, 0,},
|
||||
@ -426,3 +425,35 @@ int tableReadTexCoordVertexSize[4][8][2] = {
|
||||
{2, 2,},
|
||||
},
|
||||
};
|
||||
|
||||
// Patches the texture coordinate loader table for the running CPU.
//
// Each replacement needs both build-time support (_M_SSE level) and run-time
// support (cpu_info flag); otherwise the statically initialised entry stays.
void VertexLoader_TextCoord::Init(void)
{
#if _M_SSE >= 0x301
	// 16-bit index, float format, two elements (per the assigned loader).
	if (cpu_info.bSSSE3)
		tableReadTexCoord[3][4][1] = TexCoord_ReadIndex16_Float2_SSSE3;
#endif

#if _M_SSE >= 0x401
	// 16-bit index, short format, two elements (per the assigned loader).
	if (cpu_info.bSSE4_1)
		tableReadTexCoord[3][3][1] = TexCoord_ReadIndex16_Short2_SSE4;
#endif
}
|
||||
|
||||
unsigned int VertexLoader_TextCoord::GetSize(unsigned int _type, unsigned int _format, unsigned int _elements) {
|
||||
return tableReadTexCoordVertexSize[_type][_format][_elements];
|
||||
}
|
||||
|
||||
// Looks up the loader function in tableReadTexCoord for the given texture
// coordinate descriptor type, format and element-count selector. The
// arguments are used directly as table indices and are not range-checked; the
// table contains NULL entries.
TPipelineFunction VertexLoader_TextCoord::GetFunction(unsigned int _type, unsigned int _format, unsigned int _elements)
{
	const TPipelineFunction loader = tableReadTexCoord[_type][_format][_elements];
	return loader;
}
|
||||
|
||||
// Hands out TexCoord_Read_Dummy as a pipeline function, so callers do not
// need to reference the loader directly.
TPipelineFunction VertexLoader_TextCoord::GetDummyFunction()
{
	const TPipelineFunction dummy = TexCoord_Read_Dummy;
	return dummy;
}
|
||||
|
@ -20,19 +20,22 @@
|
||||
|
||||
#include "NativeVertexFormat.h"
|
||||
|
||||
typedef void (LOADERDECL *ReadTexCoord)();
|
||||
class VertexLoader_TextCoord
|
||||
{
|
||||
public:
|
||||
|
||||
// Hold function pointers of texture coordinates loaders.
|
||||
// The first dimension corresponds to TVtxDesc.Tex?Coord.
|
||||
// The second dimension corresponds to TVtxAttr.texCoord[?].Format.
|
||||
// The third dimension corresponds to TVtxAttr.texCoord[?].Elements.
|
||||
// The dimensions are aligned to 2^n for speed up.
|
||||
extern ReadTexCoord tableReadTexCoord[4][8][2];
|
||||
// Init
|
||||
static void Init(void);
|
||||
|
||||
// Hold vertex size of each vertex format.
|
||||
// The dimensions are same as tableReadPosition.
|
||||
extern int tableReadTexCoordVertexSize[4][8][2];
|
||||
// GetSize
|
||||
static unsigned int GetSize(unsigned int _type, unsigned int _format, unsigned int _elements);
|
||||
|
||||
void LOADERDECL TexCoord_Read_Dummy();
|
||||
// GetFunction
|
||||
static TPipelineFunction GetFunction(unsigned int _type, unsigned int _format, unsigned int _elements);
|
||||
|
||||
// GetDummyFunction
|
||||
// It is important to synchronize tcIndex.
|
||||
static TPipelineFunction GetDummyFunction();
|
||||
};
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user