From e3a2569b532a567c43210934ed07b54ac0c63767 Mon Sep 17 00:00:00 2001
From: "gregory.hainaut@gmail.com" <gregory.hainaut@gmail.com>
Date: Fri, 15 Oct 2010 12:18:35 +0000
Subject: [PATCH] GregMiscellaneous: zzogl-pg:
 * Fix previous commit :)

git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3921 96395faa-99c1-11dd-bbfe-3dabce05a288
---
 plugins/zzogl-pg/opengl/x86.cpp | 90 +++++++++++++++++++++++----------
 1 file changed, 64 insertions(+), 26 deletions(-)

diff --git a/plugins/zzogl-pg/opengl/x86.cpp b/plugins/zzogl-pg/opengl/x86.cpp
index a71dbfbb5..4fea70f80 100644
--- a/plugins/zzogl-pg/opengl/x86.cpp
+++ b/plugins/zzogl-pg/opengl/x86.cpp
@@ -649,14 +649,9 @@ static const __aligned16 int s_clut16mask[8] = { 0xffff0000, 0xffff0000, 0xffff0
 				0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff };
 
-template <bool CSA_0_15>
-__forceinline void WriteCLUT_T16_I4_CSM1_core_sse2(u32* vm, u32* clut)
+template <bool CSA_0_15, bool HIGH_16BITS_VM>
+void __fastcall WriteCLUT_T16_I4_CSM1_core_sse2(u32* vm, u32* clut)
 {
-	// CSA 0-15
-	// Replace lower 16 bits of clut with lower 16 bits of vm
-	// CSA 16-31
-	// Replace higher 16 bits of clut with higher 16 bits of vm
-
 	__m128i vm_0;
 	__m128i vm_1;
 	__m128i vm_2;
 	__m128i vm_3;
@@ -668,19 +663,47 @@ __forceinline void WriteCLUT_T16_I4_CSM1_core_sse2(u32* vm, u32* clut)
 
 	__m128i clut_mask = _mm_load_si128((__m128i*)s_clut_16bits_mask);
 
-	// load new data & remove useless part
-	if(CSA_0_15) {
+	// Note:
+	// !HIGH_16BITS_VM
+	// CSA in 0-15  -> Replace lower 16 bits of clut0 with lower 16 bits of vm
+	// CSA in 16-31 -> Replace higher 16 bits of clut0 with lower 16 bits of vm
+	//
+	// HIGH_16BITS_VM
+	// CSA in 0-15  -> Replace lower 16 bits of clut0 with higher 16 bits of vm
+	// CSA in 16-31 -> Replace higher 16 bits of clut0 with higher 16 bits of vm
+
+	if(HIGH_16BITS_VM && CSA_0_15) {
+		// move high to low
+		vm_0 = _mm_load_si128((__m128i*)vm);   // 9 8 1 0
+		vm_1 = _mm_load_si128((__m128i*)vm+1); // 11 10 3 2
+		vm_2 = _mm_load_si128((__m128i*)vm+2); // 13 12 5 4
+		vm_3 = _mm_load_si128((__m128i*)vm+3); // 15 14 7 6
+		vm_0 = _mm_srli_epi32(vm_0, 16);
+		vm_1 = _mm_srli_epi32(vm_1, 16);
+		vm_2 = _mm_srli_epi32(vm_2, 16);
+		vm_3 = _mm_srli_epi32(vm_3, 16);
+	} else if(HIGH_16BITS_VM && !CSA_0_15) {
+		// Remove lower 16 bits
+		vm_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm));   // 9 8 1 0
+		vm_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+1)); // 11 10 3 2
+		vm_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+2)); // 13 12 5 4
+		vm_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+3)); // 15 14 7 6
+	} else if(!HIGH_16BITS_VM && CSA_0_15) {
 		// Remove higher 16 bits
 		vm_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm));   // 9 8 1 0
 		vm_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+1)); // 11 10 3 2
 		vm_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+2)); // 13 12 5 4
 		vm_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+3)); // 15 14 7 6
-	} else {
-		// Remove lower 16 bits
-		vm_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm));
-		vm_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+1));
-		vm_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+2));
-		vm_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+3));
+	} else if(!HIGH_16BITS_VM && !CSA_0_15) {
+		// move low to high
+		vm_0 = _mm_load_si128((__m128i*)vm);   // 9 8 1 0
+		vm_1 = _mm_load_si128((__m128i*)vm+1); // 11 10 3 2
+		vm_2 = _mm_load_si128((__m128i*)vm+2); // 13 12 5 4
+		vm_3 = _mm_load_si128((__m128i*)vm+3); // 15 14 7 6
+		vm_0 = _mm_slli_epi32(vm_0, 16);
+		vm_1 = _mm_slli_epi32(vm_1, 16);
+		vm_2 = _mm_slli_epi32(vm_2, 16);
+		vm_3 = _mm_slli_epi32(vm_3, 16);
 	}
 
 	// Unsizzle the data
@@ -719,9 +742,11 @@
 
 extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut)
 {
 	if ((u32)clut & 0x0F) {
-		WriteCLUT_T16_I4_CSM1_core_sse2<false>(vm, clut);
+		// CSA 16-31 && low 16bits vm
+		WriteCLUT_T16_I4_CSM1_core_sse2<false, false>(vm, clut);
 	} else {
-		WriteCLUT_T16_I4_CSM1_core_sse2<true>(vm, clut);
+		// CSA 0-15 && low 16bits vm
+		WriteCLUT_T16_I4_CSM1_core_sse2<true, false>(vm, clut);
 	}
 }
@@ -1078,23 +1103,36 @@
 __forceinline void WriteCLUT_T16_I8_CSM1_sse2(u32* vm, u32 csa)
 {
 	// update the right clut column (csa < 16)
 	u32* clut = (u32*)(g_pbyGSClut + 64*(csa & 15));
-	// u32 csa_right = (csa < 16) ? 16 - csa : 0;
-	u32 csa_right = 16 - csa;
+	u32 csa_right = (csa < 16) ? 16 - csa : 0;
 
-	for(int i = csa_right; i > 0 ; --i) {
-		WriteCLUT_T16_I4_CSM1_core_sse2<true>(vm, clut);
-		vm += 16; // go down one column
+	for(int i = (csa_right/2); i > 0 ; --i) {
+		WriteCLUT_T16_I4_CSM1_core_sse2<true, false>(vm, clut);
 		clut += 16;
+		WriteCLUT_T16_I4_CSM1_core_sse2<true, true>(vm, clut);
+		clut += 16;
+		vm += 16; // go down one column
 	}
 
 	// update the left clut column
-	clut = (u32*)(g_pbyGSClut);
 	u32 csa_left = (csa >= 16) ? 16 : csa;
 
-	for(int i = csa_left; i > 0 ; --i) {
-		WriteCLUT_T16_I4_CSM1_core_sse2<false>(vm, clut);
-		vm += 16; // go down one column
+	// In case csa_right is odd (so csa_left is also odd), we cross the clut column
+	if(csa_right & 0x1) {
+		WriteCLUT_T16_I4_CSM1_core_sse2<true, false>(vm, clut);
+		// go back to the base before processing left clut column
+		clut = (u32*)(g_pbyGSClut);
+		WriteCLUT_T16_I4_CSM1_core_sse2<false, true>(vm, clut);
+	} else if(csa_right != 0) {
+		// go back to the base before processing left clut column
+		clut = (u32*)(g_pbyGSClut);
+	}
+
+	for(int i = (csa_left/2); i > 0 ; --i) {
+		WriteCLUT_T16_I4_CSM1_core_sse2<false, false>(vm, clut);
 		clut += 16;
+		WriteCLUT_T16_I4_CSM1_core_sse2<false, true>(vm, clut);
+		clut += 16;
+		vm += 16; // go down one column
 	}
 }
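
For reference, the selection logic behind the four WriteCLUT_T16_I4_CSM1_core_sse2 variants can be restated in scalar form. The sketch below is illustrative only: it assumes the CSM1 layout described in the patch's Note (CSA 0-15 in the low 16 bits of each 32-bit clut entry, CSA 16-31 in the high 16 bits), it ignores the unswizzling that the SSE2 version also performs, and the scalar function name is made up for this note rather than taken from the plugin.

#include <stdint.h>

// Hypothetical scalar equivalent: one call updates one CLUT column of 16
// colors, i.e. 16 u32 clut entries fed from 16 u32 vm words, matching the
// "clut += 16; vm += 16;" strides used in the patch.
template <bool CSA_0_15, bool HIGH_16BITS_VM>
void WriteCLUT_T16_I4_CSM1_core_scalar(const uint32_t* vm, uint32_t* clut)
{
	for (int i = 0; i < 16; i++) {
		// Pick which 16-bit half of the vm word supplies the color.
		uint32_t color = HIGH_16BITS_VM ? (vm[i] >> 16) : (vm[i] & 0xffffu);

		if (CSA_0_15) {
			// CSA 0-15: replace the low 16 bits of the clut entry.
			clut[i] = (clut[i] & 0xffff0000u) | color;
		} else {
			// CSA 16-31: replace the high 16 bits of the clut entry.
			clut[i] = (clut[i] & 0x0000ffffu) | (color << 16);
		}
	}
}

Read this way, the reworked WriteCLUT_T16_I8_CSM1_sse2 loops follow naturally: each 32-bit vm word carries two 16-bit colors, so one vm column (vm += 16) feeds two clut columns, pairing a HIGH_16BITS_VM=false call with a HIGH_16BITS_VM=true call, and an odd csa makes one such pair straddle the crossover from the low clut halves to the high ones.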