mirror of
https://github.com/libretro/ppsspp.git
synced 2024-12-01 12:20:24 +00:00
Merge pull request #7751 from unknownbrackets/colorconv
A bit more SSE color conversion funcs, stub out NEON space
This commit is contained in:
commit
5b75d5d33f
@ -290,7 +290,8 @@ set(CommonExtra)
|
||||
if(ARM)
|
||||
set(CommonExtra ${CommonExtra}
|
||||
Common/ArmCPUDetect.cpp
|
||||
Common/ArmThunk.cpp)
|
||||
Common/ArmThunk.cpp
|
||||
Common/ColorConvNEON.cpp)
|
||||
elseif(X86)
|
||||
set(CommonExtra ${CommonExtra}
|
||||
Common/ABI.cpp
|
||||
|
@ -15,9 +15,15 @@
|
||||
// Official git repository and contact information can be found at
|
||||
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
|
||||
|
||||
|
||||
#include "ColorConv.h"
|
||||
#include "CommonTypes.h"
|
||||
// NEON is in a separate file so that it can be compiled with a runtime check.
|
||||
#include "ColorConvNEON.h"
|
||||
#include "Common.h"
|
||||
#include "CPUDetect.h"
|
||||
|
||||
#ifdef _M_SSE
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
inline u16 RGBA8888toRGB565(u32 px) {
|
||||
return ((px >> 3) & 0x001F) | ((px >> 5) & 0x07E0) | ((px >> 8) & 0xF800);
|
||||
@ -269,8 +275,51 @@ void ConvertRGBA8888ToRGBA4444(u16 *dst, const u32 *src, const u32 numPixels) {
|
||||
}
|
||||
|
||||
void ConvertRGBA565ToRGBA8888(u32 *dst32, const u16 *src, const u32 numPixels) {
|
||||
#ifdef _M_SSE
|
||||
const __m128i mask5 = _mm_set1_epi16(0x001f);
|
||||
const __m128i mask6 = _mm_set1_epi16(0x003f);
|
||||
const __m128i mask8 = _mm_set1_epi16(0x00ff);
|
||||
|
||||
const __m128i *srcp = (const __m128i *)src;
|
||||
__m128i *dstp = (__m128i *)dst32;
|
||||
u32 sseChunks = numPixels / 8;
|
||||
if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {
|
||||
sseChunks = 0;
|
||||
}
|
||||
for (u32 i = 0; i < sseChunks; ++i) {
|
||||
const __m128i c = _mm_load_si128(&srcp[i]);
|
||||
|
||||
// Swizzle, resulting in RR00 RR00.
|
||||
__m128i r = _mm_and_si128(c, mask5);
|
||||
r = _mm_or_si128(_mm_slli_epi16(r, 3), _mm_srli_epi16(r, 2));
|
||||
r = _mm_and_si128(r, mask8);
|
||||
|
||||
// This one becomes 00GG 00GG.
|
||||
__m128i g = _mm_and_si128(_mm_srli_epi16(c, 5), mask6);
|
||||
g = _mm_or_si128(_mm_slli_epi16(g, 2), _mm_srli_epi16(g, 4));
|
||||
g = _mm_slli_epi16(g, 8);
|
||||
|
||||
// Almost done, we aim for BB00 BB00 again here.
|
||||
__m128i b = _mm_and_si128(_mm_srli_epi16(c, 11), mask5);
|
||||
b = _mm_or_si128(_mm_slli_epi16(b, 3), _mm_srli_epi16(b, 2));
|
||||
b = _mm_and_si128(b, mask8);
|
||||
|
||||
// Always set to 00FF 00FF.
|
||||
__m128i a = _mm_slli_epi16(mask8, 8);
|
||||
|
||||
// Now combine them, RRGG RRGG and BBAA BBAA, and then interleave.
|
||||
const __m128i rg = _mm_or_si128(r, g);
|
||||
const __m128i ba = _mm_or_si128(b, a);
|
||||
_mm_store_si128(&dstp[i * 2 + 0], _mm_unpacklo_epi16(rg, ba));
|
||||
_mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));
|
||||
}
|
||||
u32 i = sseChunks * 8;
|
||||
#else
|
||||
u32 i = 0;
|
||||
#endif
|
||||
|
||||
u8 *dst = (u8 *)dst32;
|
||||
for (u32 x = 0; x < numPixels; x++) {
|
||||
for (u32 x = i; x < numPixels; x++) {
|
||||
u16 col = src[x];
|
||||
dst[x * 4] = Convert5To8((col) & 0x1f);
|
||||
dst[x * 4 + 1] = Convert6To8((col >> 5) & 0x3f);
|
||||
@ -280,8 +329,52 @@ void ConvertRGBA565ToRGBA8888(u32 *dst32, const u16 *src, const u32 numPixels) {
|
||||
}
|
||||
|
||||
void ConvertRGBA5551ToRGBA8888(u32 *dst32, const u16 *src, const u32 numPixels) {
|
||||
#ifdef _M_SSE
|
||||
const __m128i mask5 = _mm_set1_epi16(0x001f);
|
||||
const __m128i mask8 = _mm_set1_epi16(0x00ff);
|
||||
const __m128i one = _mm_set1_epi16(0x0001);
|
||||
|
||||
const __m128i *srcp = (const __m128i *)src;
|
||||
__m128i *dstp = (__m128i *)dst32;
|
||||
u32 sseChunks = numPixels / 8;
|
||||
if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {
|
||||
sseChunks = 0;
|
||||
}
|
||||
for (u32 i = 0; i < sseChunks; ++i) {
|
||||
const __m128i c = _mm_load_si128(&srcp[i]);
|
||||
|
||||
// Swizzle, resulting in RR00 RR00.
|
||||
__m128i r = _mm_and_si128(c, mask5);
|
||||
r = _mm_or_si128(_mm_slli_epi16(r, 3), _mm_srli_epi16(r, 2));
|
||||
r = _mm_and_si128(r, mask8);
|
||||
|
||||
// This one becomes 00GG 00GG.
|
||||
__m128i g = _mm_and_si128(_mm_srli_epi16(c, 5), mask5);
|
||||
g = _mm_or_si128(_mm_slli_epi16(g, 3), _mm_srli_epi16(g, 2));
|
||||
g = _mm_slli_epi16(g, 8);
|
||||
|
||||
// Almost done, we aim for BB00 BB00 again here.
|
||||
__m128i b = _mm_and_si128(_mm_srli_epi16(c, 10), mask5);
|
||||
b = _mm_or_si128(_mm_slli_epi16(b, 3), _mm_srli_epi16(b, 2));
|
||||
b = _mm_and_si128(b, mask8);
|
||||
|
||||
// 1 bit A to 00AA 00AA.
|
||||
__m128i a = _mm_srli_epi16(c, 15);
|
||||
a = _mm_slli_epi16(_mm_cmpeq_epi16(a, one), 8);
|
||||
|
||||
// Now combine them, RRGG RRGG and BBAA BBAA, and then interleave.
|
||||
const __m128i rg = _mm_or_si128(r, g);
|
||||
const __m128i ba = _mm_or_si128(b, a);
|
||||
_mm_store_si128(&dstp[i * 2 + 0], _mm_unpacklo_epi16(rg, ba));
|
||||
_mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));
|
||||
}
|
||||
u32 i = sseChunks * 8;
|
||||
#else
|
||||
u32 i = 0;
|
||||
#endif
|
||||
|
||||
u8 *dst = (u8 *)dst32;
|
||||
for (u32 x = 0; x < numPixels; x++) {
|
||||
for (u32 x = i; x < numPixels; x++) {
|
||||
u16 col = src[x];
|
||||
dst[x * 4] = Convert5To8((col) & 0x1f);
|
||||
dst[x * 4 + 1] = Convert5To8((col >> 5) & 0x1f);
|
||||
@ -290,9 +383,50 @@ void ConvertRGBA5551ToRGBA8888(u32 *dst32, const u16 *src, const u32 numPixels)
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: This seems to be BGRA4444 -> RGBA8888?
|
||||
void ConvertRGBA4444ToRGBA8888(u32 *dst32, const u16 *src, const u32 numPixels) {
|
||||
#ifdef _M_SSE
|
||||
const __m128i mask4 = _mm_set1_epi16(0x000f);
|
||||
const __m128i mask8 = _mm_set1_epi16(0x00ff);
|
||||
const __m128i one = _mm_set1_epi16(0x0001);
|
||||
|
||||
const __m128i *srcp = (const __m128i *)src;
|
||||
__m128i *dstp = (__m128i *)dst32;
|
||||
u32 sseChunks = numPixels / 8;
|
||||
if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {
|
||||
sseChunks = 0;
|
||||
}
|
||||
for (u32 i = 0; i < sseChunks; ++i) {
|
||||
const __m128i c = _mm_load_si128(&srcp[i]);
|
||||
|
||||
// Let's just grab R000 R000, without swizzling yet.
|
||||
__m128i r = _mm_and_si128(_mm_srli_epi16(c, 8), mask4);
|
||||
// And then 00G0 00G0.
|
||||
__m128i g = _mm_and_si128(_mm_srli_epi16(c, 4), mask4);
|
||||
g = _mm_slli_epi16(g, 8);
|
||||
// Now B000 B000.
|
||||
__m128i b = _mm_and_si128(c, mask4);
|
||||
// And lastly 00A0 00A0. No mask needed, we have a wall.
|
||||
__m128i a = _mm_srli_epi16(c, 12);
|
||||
a = _mm_slli_epi16(g, 8);
|
||||
|
||||
// We swizzle after combining - R0G0 R0G0 and B0A0 B0A0 -> RRGG RRGG and BBAA BBAA.
|
||||
__m128i rg = _mm_or_si128(r, g);
|
||||
__m128i ba = _mm_or_si128(b, a);
|
||||
rg = _mm_or_si128(rg, _mm_slli_epi16(rg, 4));
|
||||
ba = _mm_or_si128(ba, _mm_slli_epi16(ba, 4));
|
||||
|
||||
// And then we can store.
|
||||
_mm_store_si128(&dstp[i * 2 + 0], _mm_unpacklo_epi16(rg, ba));
|
||||
_mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));
|
||||
}
|
||||
u32 i = sseChunks * 8;
|
||||
#else
|
||||
u32 i = 0;
|
||||
#endif
|
||||
|
||||
u8 *dst = (u8 *)dst32;
|
||||
for (u32 x = 0; x < numPixels; x++) {
|
||||
for (u32 x = i; x < numPixels; x++) {
|
||||
u16 col = src[x];
|
||||
dst[x * 4] = Convert4To8((col >> 8) & 0xf);
|
||||
dst[x * 4 + 1] = Convert4To8((col >> 4) & 0xf);
|
||||
@ -301,6 +435,7 @@ void ConvertRGBA4444ToRGBA8888(u32 *dst32, const u16 *src, const u32 numPixels)
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: This seems to be ABGR4444 -> RGBA8888?
|
||||
void ConvertBGRA4444ToRGBA8888(u32 *dst32, const u16 *src, const u32 numPixels) {
|
||||
u8 *dst = (u8 *)dst32;
|
||||
for (u32 x = 0; x < numPixels; x++) {
|
||||
@ -332,4 +467,135 @@ void ConvertBGR565ToRGBA8888(u32 *dst, const u16 *src, const u32 numPixels) {
|
||||
u16 col0 = src[x];
|
||||
ARGB8From565(col0, &dst[x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Mirrors the four 4-bit fields of every 16-bit pixel (nibble order 3-2-1-0
// becomes 0-1-2-3).  Reads numPixels pixels from src and writes them to dst.
void ConvertRGBA4444ToABGR4444(u16 *dst, const u16 *src, const u32 numPixels) {
#ifdef _M_SSE
	const __m128i nibble1 = _mm_set1_epi16(0x00F0);
	const __m128i nibble2 = _mm_set1_epi16(0x0F00);

	const __m128i *in128 = (const __m128i *)src;
	__m128i *out128 = (__m128i *)dst;

	// Aligned loads and stores are used below, so the SSE loop only runs when
	// both buffers start on a 16-byte boundary.
	const intptr_t lowBits = ((intptr_t)src | (intptr_t)dst) & 0xF;
	const u32 sseChunks = lowBits != 0 ? 0 : numPixels / 8;
	for (u32 chunk = 0; chunk < sseChunks; ++chunk) {
		const __m128i px = _mm_load_si128(&in128[chunk]);
		// Move every nibble to its mirrored slot.  The two outermost moves need
		// no mask since the per-lane shift already drops the other bits.
		const __m128i fromTop = _mm_srli_epi16(px, 12);
		const __m128i fromUpper = _mm_and_si128(_mm_srli_epi16(px, 4), nibble1);
		const __m128i fromLower = _mm_and_si128(_mm_slli_epi16(px, 4), nibble2);
		const __m128i fromBottom = _mm_slli_epi16(px, 12);
		_mm_store_si128(&out128[chunk],
			_mm_or_si128(_mm_or_si128(fromTop, fromUpper), _mm_or_si128(fromLower, fromBottom)));
	}
	// Each SSE chunk was 8 pixels, which is 4 of the pixel pairs handled below.
	u32 pair = sseChunks * 4;
#else
	u32 pair = 0;
#endif

	// Scalar remainder: two pixels at a time through a 32-bit view.
	const u32 *srcPairs = (const u32 *)src;
	u32 *dstPairs = (u32 *)dst;
	const u32 numPairs = numPixels / 2;
	while (pair < numPairs) {
		const u32 two = srcPairs[pair];
		dstPairs[pair] = ((two >> 12) & 0x000F000F) |
		                 ((two >> 4) & 0x00F000F0) |
		                 ((two << 4) & 0x0F000F00) |
		                 ((two << 12) & 0xF000F000);
		++pair;
	}

	// An odd pixel count leaves one final pixel that is not part of any pair.
	if ((numPixels & 1) != 0) {
		const u32 last = numPixels - 1;
		const u16 px = src[last];
		dst[last] = ((px >> 12) & 0x000F) |
		            ((px >> 4) & 0x00F0) |
		            ((px << 4) & 0x0F00) |
		            ((px << 12) & 0xF000);
	}
}
|
||||
|
||||
// Mirrors the field order of every 16-bit 5/5/5/1 pixel: bit 15 moves to
// bit 0 and the three 5-bit fields swap ends, producing a 1/5/5/5 layout.
// Reads numPixels pixels from src and writes them to dst.
void ConvertRGBA5551ToABGR1555(u16 *dst, const u16 *src, const u32 numPixels) {
#ifdef _M_SSE
	const __m128i field1 = _mm_set1_epi16(0x003E);
	const __m128i field2 = _mm_set1_epi16(0x07C0);

	const __m128i *in128 = (const __m128i *)src;
	__m128i *out128 = (__m128i *)dst;

	// Aligned loads and stores are used below, so the SSE loop only runs when
	// both buffers start on a 16-byte boundary.
	const intptr_t lowBits = ((intptr_t)src | (intptr_t)dst) & 0xF;
	const u32 sseChunks = lowBits != 0 ? 0 : numPixels / 8;
	for (u32 chunk = 0; chunk < sseChunks; ++chunk) {
		const __m128i px = _mm_load_si128(&in128[chunk]);
		// Bit 15 -> bit 0, bits 10-14 -> bits 1-5, bits 5-9 -> bits 6-10,
		// bits 0-4 -> bits 11-15.
		const __m128i topBit = _mm_srli_epi16(px, 15);
		const __m128i fromUpper = _mm_and_si128(_mm_srli_epi16(px, 9), field1);
		const __m128i fromMiddle = _mm_and_si128(_mm_slli_epi16(px, 1), field2);
		const __m128i fromBottom = _mm_slli_epi16(px, 11);
		_mm_store_si128(&out128[chunk],
			_mm_or_si128(_mm_or_si128(topBit, fromUpper), _mm_or_si128(fromMiddle, fromBottom)));
	}
	// Each SSE chunk was 8 pixels, which is 4 of the pixel pairs handled below.
	u32 pair = sseChunks * 4;
#else
	u32 pair = 0;
#endif

	// Scalar remainder: two pixels at a time through a 32-bit view.
	const u32 *srcPairs = (const u32 *)src;
	u32 *dstPairs = (u32 *)dst;
	const u32 numPairs = numPixels / 2;
	while (pair < numPairs) {
		const u32 two = srcPairs[pair];
		dstPairs[pair] = ((two >> 15) & 0x00010001) |
		                 ((two >> 9) & 0x003E003E) |
		                 ((two << 1) & 0x07C007C0) |
		                 ((two << 11) & 0xF800F800);
		++pair;
	}

	// An odd pixel count leaves one final pixel that is not part of any pair.
	if ((numPixels & 1) != 0) {
		const u32 last = numPixels - 1;
		const u16 px = src[last];
		dst[last] = ((px >> 15) & 0x0001) |
		            ((px >> 9) & 0x003E) |
		            ((px << 1) & 0x07C0) |
		            ((px << 11) & 0xF800);
	}
}
|
||||
|
||||
// Swaps the top and bottom 5-bit fields of every 16-bit 5/6/5 pixel while
// leaving the middle 6-bit field in place.  Reads numPixels pixels from src
// and writes them to dst.
void ConvertRGB565ToBGR565(u16 *dst, const u16 *src, const u32 numPixels) {
#ifdef _M_SSE
	const __m128i middle6 = _mm_set1_epi16(0x07E0);

	const __m128i *in128 = (const __m128i *)src;
	__m128i *out128 = (__m128i *)dst;

	// Aligned loads and stores are used below, so the SSE loop only runs when
	// both buffers start on a 16-byte boundary.
	const intptr_t lowBits = ((intptr_t)src | (intptr_t)dst) & 0xF;
	const u32 sseChunks = lowBits != 0 ? 0 : numPixels / 8;
	for (u32 chunk = 0; chunk < sseChunks; ++chunk) {
		const __m128i px = _mm_load_si128(&in128[chunk]);
		// Top five bits drop to the bottom, bottom five rise to the top; the
		// per-lane shifts discard everything else, so only the middle needs a mask.
		const __m128i fromTop = _mm_srli_epi16(px, 11);
		const __m128i kept = _mm_and_si128(px, middle6);
		const __m128i fromBottom = _mm_slli_epi16(px, 11);
		_mm_store_si128(&out128[chunk], _mm_or_si128(_mm_or_si128(fromTop, kept), fromBottom));
	}
	// Each SSE chunk was 8 pixels, which is 4 of the pixel pairs handled below.
	u32 pair = sseChunks * 4;
#else
	u32 pair = 0;
#endif

	// Scalar remainder: two pixels at a time through a 32-bit view.
	const u32 *srcPairs = (const u32 *)src;
	u32 *dstPairs = (u32 *)dst;
	const u32 numPairs = numPixels / 2;
	while (pair < numPairs) {
		const u32 two = srcPairs[pair];
		dstPairs[pair] = ((two >> 11) & 0x001F001F) |
		                 ((two >> 0) & 0x07E007E0) |
		                 ((two << 11) & 0xF800F800);
		++pair;
	}

	// An odd pixel count leaves one final pixel that is not part of any pair.
	if ((numPixels & 1) != 0) {
		const u32 last = numPixels - 1;
		const u16 px = src[last];
		dst[last] = ((px >> 11) & 0x001F) |
		            ((px >> 0) & 0x07E0) |
		            ((px << 11) & 0xF800);
	}
}
|
||||
|
@ -105,6 +105,7 @@ void convert5551_dx9(u16* data, u32* out, int width, int l, int u);
|
||||
// "Complete" set of color conversion functions between the usual formats.
|
||||
|
||||
void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, const u32 numPixels);
|
||||
#define ConvertRGBA8888ToBGRA8888 ConvertBGRA8888ToRGBA8888
|
||||
|
||||
void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, const u32 numPixels);
|
||||
void ConvertRGBA8888ToRGB565(u16 *dst, const u32 *src, const u32 numPixels);
|
||||
@ -121,3 +122,7 @@ void ConvertRGBA4444ToRGBA8888(u32 *dst, const u16 *src, const u32 numPixels);
|
||||
void ConvertBGRA4444ToRGBA8888(u32 *dst, const u16 *src, const u32 numPixels);
|
||||
void ConvertBGRA5551ToRGBA8888(u32 *dst, const u16 *src, const u32 numPixels);
|
||||
void ConvertBGR565ToRGBA8888(u32 *dst, const u16 *src, const u32 numPixels);
|
||||
|
||||
void ConvertRGBA4444ToABGR4444(u16 *dst, const u16 *src, const u32 numPixels);
|
||||
void ConvertRGBA5551ToABGR1555(u16 *dst, const u16 *src, const u32 numPixels);
|
||||
void ConvertRGB565ToBGR565(u16 *dst, const u16 *src, const u32 numPixels);
|
||||
|
23
Common/ColorConvNEON.cpp
Normal file
23
Common/ColorConvNEON.cpp
Normal file
@ -0,0 +1,23 @@
|
||||
// Copyright (c) 2015- PPSSPP Project.
|
||||
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, version 2.0 or later versions.
|
||||
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License 2.0 for more details.
|
||||
|
||||
// A copy of the GPL 2.0 should have been included with the program.
|
||||
// If not, see http://www.gnu.org/licenses/
|
||||
|
||||
// Official git repository and contact information can be found at
|
||||
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include "ColorConvNEON.h"
|
||||
#include "Common.h"
|
||||
#include "CPUDetect.h"
|
||||
|
||||
// TODO: NEON color conversion funcs.
|
20
Common/ColorConvNEON.h
Normal file
20
Common/ColorConvNEON.h
Normal file
@ -0,0 +1,20 @@
|
||||
// Copyright (c) 2015- PPSSPP Project.
|
||||
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, version 2.0 or later versions.
|
||||
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License 2.0 for more details.
|
||||
|
||||
// A copy of the GPL 2.0 should have been included with the program.
|
||||
// If not, see http://www.gnu.org/licenses/
|
||||
|
||||
// Official git repository and contact information can be found at
|
||||
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ColorConv.h"
|
@ -194,6 +194,7 @@
|
||||
<ClInclude Include="Atomic_GCC.h" />
|
||||
<ClInclude Include="Atomic_Win32.h" />
|
||||
<ClInclude Include="BitSet.h" />
|
||||
<ClInclude Include="ColorConvNEON.h" />
|
||||
<ClInclude Include="ChunkFile.h" />
|
||||
<ClInclude Include="CodeBlock.h" />
|
||||
<ClInclude Include="ColorConv.h" />
|
||||
@ -242,6 +243,12 @@
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="ColorConvNEON.cpp">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="ChunkFile.cpp" />
|
||||
<ClCompile Include="ColorConv.cpp" />
|
||||
<ClCompile Include="ConsoleListener.cpp" />
|
||||
|
@ -46,6 +46,7 @@
|
||||
<ClInclude Include="BitSet.h" />
|
||||
<ClInclude Include="CodeBlock.h" />
|
||||
<ClInclude Include="ColorConv.h" />
|
||||
<ClInclude Include="ColorConvNEON.h" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="stdafx.cpp" />
|
||||
@ -81,6 +82,7 @@
|
||||
<ClCompile Include="MipsEmitter.cpp" />
|
||||
<ClCompile Include="Arm64Emitter.cpp" />
|
||||
<ClCompile Include="ColorConv.cpp" />
|
||||
<ClCompile Include="ColorConvNEON.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<None Include="CMakeLists.txt" />
|
||||
|
@ -787,128 +787,18 @@ static void ConvertColors(void *dstBuf, const void *srcBuf, GLuint dstFmt, int n
|
||||
u32 *dst = (u32 *)dstBuf;
|
||||
switch (dstFmt) {
|
||||
case GL_UNSIGNED_SHORT_4_4_4_4:
|
||||
{
|
||||
#ifdef _M_SSE
|
||||
const __m128i maskB = _mm_set1_epi16(0x00F0);
|
||||
const __m128i maskG = _mm_set1_epi16(0x0F00);
|
||||
|
||||
__m128i *srcp = (__m128i *)src;
|
||||
__m128i *dstp = (__m128i *)dst;
|
||||
const int sseChunks = numPixels / 8;
|
||||
for (int i = 0; i < sseChunks; ++i) {
|
||||
__m128i c = _mm_load_si128(&srcp[i]);
|
||||
__m128i v = _mm_srli_epi16(c, 12);
|
||||
v = _mm_or_si128(v, _mm_and_si128(_mm_srli_epi16(c, 4), maskB));
|
||||
v = _mm_or_si128(v, _mm_and_si128(_mm_slli_epi16(c, 4), maskG));
|
||||
v = _mm_or_si128(v, _mm_slli_epi16(c, 12));
|
||||
_mm_store_si128(&dstp[i], v);
|
||||
}
|
||||
// The remainder is done in chunks of 2, SSE was chunks of 8.
|
||||
int i = sseChunks * 8 / 2;
|
||||
#else
|
||||
int i = 0;
|
||||
// TODO: NEON.
|
||||
#endif
|
||||
for (; i < (numPixels + 1) / 2; i++) {
|
||||
u32 c = src[i];
|
||||
dst[i] = ((c >> 12) & 0x000F000F) |
|
||||
((c >> 4) & 0x00F000F0) |
|
||||
((c << 4) & 0x0F000F00) |
|
||||
((c << 12) & 0xF000F000);
|
||||
}
|
||||
}
|
||||
ConvertRGBA4444ToABGR4444((u16 *)dst, (const u16 *)src, numPixels);
|
||||
break;
|
||||
// Final Fantasy 2 uses this heavily in animated textures.
|
||||
case GL_UNSIGNED_SHORT_5_5_5_1:
|
||||
{
|
||||
#ifdef _M_SSE
|
||||
const __m128i maskB = _mm_set1_epi16(0x003E);
|
||||
const __m128i maskG = _mm_set1_epi16(0x07C0);
|
||||
|
||||
__m128i *srcp = (__m128i *)src;
|
||||
__m128i *dstp = (__m128i *)dst;
|
||||
const int sseChunks = numPixels / 8;
|
||||
for (int i = 0; i < sseChunks; ++i) {
|
||||
__m128i c = _mm_load_si128(&srcp[i]);
|
||||
__m128i v = _mm_srli_epi16(c, 15);
|
||||
v = _mm_or_si128(v, _mm_and_si128(_mm_srli_epi16(c, 9), maskB));
|
||||
v = _mm_or_si128(v, _mm_and_si128(_mm_slli_epi16(c, 1), maskG));
|
||||
v = _mm_or_si128(v, _mm_slli_epi16(c, 11));
|
||||
_mm_store_si128(&dstp[i], v);
|
||||
}
|
||||
// The remainder is done in chunks of 2, SSE was chunks of 8.
|
||||
int i = sseChunks * 8 / 2;
|
||||
#else
|
||||
int i = 0;
|
||||
// TODO: NEON.
|
||||
#endif
|
||||
for (; i < (numPixels + 1) / 2; i++) {
|
||||
u32 c = src[i];
|
||||
dst[i] = ((c >> 15) & 0x00010001) |
|
||||
((c >> 9) & 0x003E003E) |
|
||||
((c << 1) & 0x07C007C0) |
|
||||
((c << 11) & 0xF800F800);
|
||||
}
|
||||
}
|
||||
ConvertRGBA5551ToABGR1555((u16 *)dst, (const u16 *)src, numPixels);
|
||||
break;
|
||||
case GL_UNSIGNED_SHORT_5_6_5:
|
||||
{
|
||||
#ifdef _M_SSE
|
||||
const __m128i maskG = _mm_set1_epi16(0x07E0);
|
||||
|
||||
__m128i *srcp = (__m128i *)src;
|
||||
__m128i *dstp = (__m128i *)dst;
|
||||
const int sseChunks = numPixels / 8;
|
||||
for (int i = 0; i < sseChunks; ++i) {
|
||||
__m128i c = _mm_load_si128(&srcp[i]);
|
||||
__m128i v = _mm_srli_epi16(c, 11);
|
||||
v = _mm_or_si128(v, _mm_and_si128(c, maskG));
|
||||
v = _mm_or_si128(v, _mm_slli_epi16(c, 11));
|
||||
_mm_store_si128(&dstp[i], v);
|
||||
}
|
||||
// The remainder is done in chunks of 2, SSE was chunks of 8.
|
||||
int i = sseChunks * 8 / 2;
|
||||
#else
|
||||
int i = 0;
|
||||
// TODO: NEON.
|
||||
#endif
|
||||
for (; i < (numPixels + 1) / 2; i++) {
|
||||
u32 c = src[i];
|
||||
dst[i] = ((c >> 11) & 0x001F001F) |
|
||||
((c >> 0) & 0x07E007E0) |
|
||||
((c << 11) & 0xF800F800);
|
||||
}
|
||||
}
|
||||
ConvertRGB565ToBGR565((u16 *)dst, (const u16 *)src, numPixels);
|
||||
break;
|
||||
default:
|
||||
if (UseBGRA8888()) {
|
||||
#ifdef _M_SSE
|
||||
const __m128i maskGA = _mm_set1_epi32(0xFF00FF00);
|
||||
|
||||
__m128i *srcp = (__m128i *)src;
|
||||
__m128i *dstp = (__m128i *)dst;
|
||||
const int sseChunks = numPixels / 4;
|
||||
for (int i = 0; i < sseChunks; ++i) {
|
||||
__m128i c = _mm_load_si128(&srcp[i]);
|
||||
__m128i rb = _mm_andnot_si128(maskGA, c);
|
||||
c = _mm_and_si128(c, maskGA);
|
||||
|
||||
__m128i b = _mm_srli_epi32(rb, 16);
|
||||
__m128i r = _mm_slli_epi32(rb, 16);
|
||||
c = _mm_or_si128(_mm_or_si128(c, r), b);
|
||||
_mm_store_si128(&dstp[i], c);
|
||||
}
|
||||
// The remainder starts right after those done via SSE.
|
||||
int i = sseChunks * 4;
|
||||
#else
|
||||
int i = 0;
|
||||
#endif
|
||||
for (; i < numPixels; i++) {
|
||||
u32 c = src[i];
|
||||
dst[i] = ((c >> 16) & 0x000000FF) |
|
||||
((c >> 0) & 0xFF00FF00) |
|
||||
((c << 16) & 0x00FF0000);
|
||||
}
|
||||
ConvertRGBA8888ToBGRA8888(dst, src, numPixels);
|
||||
} else {
|
||||
// No need to convert RGBA8888, right order already
|
||||
if (dst != src)
|
||||
|
@ -43,6 +43,8 @@ win32 {
|
||||
HEADERS += $$P/Common/MemArena.h
|
||||
}
|
||||
|
||||
armv7: SOURCES += $$P/Common/ColorConvNEON.cpp
|
||||
|
||||
SOURCES += $$P/Common/ChunkFile.cpp \
|
||||
$$P/Common/ColorConv.cpp \
|
||||
$$P/Common/ConsoleListener.cpp \
|
||||
|
@ -58,6 +58,7 @@ ARCH_FILES := \
|
||||
$(SRC)/Common/ArmEmitter.cpp \
|
||||
$(SRC)/Common/ArmCPUDetect.cpp \
|
||||
$(SRC)/Common/ArmThunk.cpp \
|
||||
$(SRC)/Common/ColorConvNEON.cpp.neon \
|
||||
$(SRC)/Core/MIPS/ARM/ArmCompALU.cpp \
|
||||
$(SRC)/Core/MIPS/ARM/ArmCompBranch.cpp \
|
||||
$(SRC)/Core/MIPS/ARM/ArmCompFPU.cpp \
|
||||
|
Loading…
Reference in New Issue
Block a user