Move color conversion funcs to ColorConv.

This paves the way a bit for NEON conversion funcs.
This commit is contained in:
Unknown W. Brackets 2015-05-17 13:45:30 -07:00
parent 3f29329ed2
commit 1767bd958c
3 changed files with 141 additions and 115 deletions

View File

@ -336,4 +336,135 @@ void ConvertBGR565ToRGBA8888(u32 *dst, const u16 *src, const u32 numPixels) {
u16 col0 = src[x];
ARGB8From565(col0, &dst[x]);
}
}
}
// Reverses the order of the four 4-bit channels inside every 16-bit pixel
// (RGBA4444 -> ABGR4444): nibble 0 <-> nibble 3 and nibble 1 <-> nibble 2.
//
// dst:       receives numPixels converted pixels.
// src:       numPixels source pixels.
// numPixels: number of pixels to convert; 0 is a no-op.
//
// Wider-than-16-bit accesses are only used when both buffers are aligned for
// them, so the result is the same for any u16-aligned pair of buffers.
void ConvertRGBA4444ToABGR4444(u16 *dst, const u16 *src, const u32 numPixels) {
	// Combined low address bits: a clear bit means it is clear in both pointers.
	const intptr_t addrBits = (intptr_t)src | (intptr_t)dst;
	// Count of leading pixels that have already been converted.
	u32 done = 0;

#ifdef _M_SSE
	// _mm_load_si128 / _mm_store_si128 require 16-byte aligned addresses.
	if ((addrBits & 0xF) == 0) {
		const __m128i maskB = _mm_set1_epi16(0x00F0);
		const __m128i maskG = _mm_set1_epi16(0x0F00);
		const __m128i *srcp = (const __m128i *)src;
		__m128i *dstp = (__m128i *)dst;
		// Eight 16-bit pixels per 128-bit register.
		const u32 sseChunks = numPixels / 8;
		for (u32 i = 0; i < sseChunks; ++i) {
			const __m128i c = _mm_load_si128(&srcp[i]);
			__m128i v = _mm_srli_epi16(c, 12);
			v = _mm_or_si128(v, _mm_and_si128(_mm_srli_epi16(c, 4), maskB));
			v = _mm_or_si128(v, _mm_and_si128(_mm_slli_epi16(c, 4), maskG));
			v = _mm_or_si128(v, _mm_slli_epi16(c, 12));
			_mm_store_si128(&dstp[i], v);
		}
		done = sseChunks * 8;
	}
#endif

	// Two pixels per 32-bit word, but only when both buffers are 4-byte
	// aligned: dereferencing a misaligned u32 pointer is undefined behavior
	// and traps on strict-alignment CPUs.
	if ((addrBits & 0x3) == 0) {
		const u32 *src32 = (const u32 *)src;
		u32 *dst32 = (u32 *)dst;
		const u32 pairs = numPixels / 2;
		for (u32 i = done / 2; i < pairs; i++) {
			const u32 c = src32[i];
			dst32[i] = ((c >> 12) & 0x000F000F) |
			           ((c >>  4) & 0x00F000F0) |
			           ((c <<  4) & 0x0F000F00) |
			           ((c << 12) & 0xF000F000);
		}
		done = pairs * 2;
	}

	// Remaining pixels one at a time: the odd tail pixel, or everything that
	// the wider paths skipped because of alignment.
	for (u32 i = done; i < numPixels; i++) {
		const u16 c = src[i];
		dst[i] = (u16)(((c >> 12) & 0x000F) |
		               ((c >>  4) & 0x00F0) |
		               ((c <<  4) & 0x0F00) |
		               ((c << 12) & 0xF000));
	}
}
// Reverses the channel order inside every 16-bit pixel (RGBA5551 -> ABGR1555):
// bit 15 moves to bit 0, bits 10-14 to bits 1-5, bits 5-9 to bits 6-10 and
// bits 0-4 to bits 11-15.
//
// dst:       receives numPixels converted pixels.
// src:       numPixels source pixels.
// numPixels: number of pixels to convert; 0 is a no-op.
//
// Wider-than-16-bit accesses are only used when both buffers are aligned for
// them, so the result is the same for any u16-aligned pair of buffers.
void ConvertRGBA5551ToABGR1555(u16 *dst, const u16 *src, const u32 numPixels) {
	// Combined low address bits: a clear bit means it is clear in both pointers.
	const intptr_t addrBits = (intptr_t)src | (intptr_t)dst;
	// Count of leading pixels that have already been converted.
	u32 done = 0;

#ifdef _M_SSE
	// _mm_load_si128 / _mm_store_si128 require 16-byte aligned addresses.
	if ((addrBits & 0xF) == 0) {
		const __m128i maskB = _mm_set1_epi16(0x003E);
		const __m128i maskG = _mm_set1_epi16(0x07C0);
		const __m128i *srcp = (const __m128i *)src;
		__m128i *dstp = (__m128i *)dst;
		// Eight 16-bit pixels per 128-bit register.
		const u32 sseChunks = numPixels / 8;
		for (u32 i = 0; i < sseChunks; ++i) {
			const __m128i c = _mm_load_si128(&srcp[i]);
			__m128i v = _mm_srli_epi16(c, 15);
			v = _mm_or_si128(v, _mm_and_si128(_mm_srli_epi16(c, 9), maskB));
			v = _mm_or_si128(v, _mm_and_si128(_mm_slli_epi16(c, 1), maskG));
			v = _mm_or_si128(v, _mm_slli_epi16(c, 11));
			_mm_store_si128(&dstp[i], v);
		}
		done = sseChunks * 8;
	}
#endif

	// Two pixels per 32-bit word, but only when both buffers are 4-byte
	// aligned: dereferencing a misaligned u32 pointer is undefined behavior
	// and traps on strict-alignment CPUs.
	if ((addrBits & 0x3) == 0) {
		const u32 *src32 = (const u32 *)src;
		u32 *dst32 = (u32 *)dst;
		const u32 pairs = numPixels / 2;
		for (u32 i = done / 2; i < pairs; i++) {
			const u32 c = src32[i];
			dst32[i] = ((c >> 15) & 0x00010001) |
			           ((c >>  9) & 0x003E003E) |
			           ((c <<  1) & 0x07C007C0) |
			           ((c << 11) & 0xF800F800);
		}
		done = pairs * 2;
	}

	// Remaining pixels one at a time: the odd tail pixel, or everything that
	// the wider paths skipped because of alignment.
	for (u32 i = done; i < numPixels; i++) {
		const u16 c = src[i];
		dst[i] = (u16)(((c >> 15) & 0x0001) |
		               ((c >>  9) & 0x003E) |
		               ((c <<  1) & 0x07C0) |
		               ((c << 11) & 0xF800));
	}
}
// Swaps the two 5-bit outer channels of every 16-bit 565 pixel
// (RGB565 -> BGR565): bits 0-4 and bits 11-15 trade places, while the 6-bit
// middle channel (bits 5-10) stays where it is.
//
// dst:       receives numPixels converted pixels.
// src:       numPixels source pixels.
// numPixels: number of pixels to convert; 0 is a no-op.
//
// Wider-than-16-bit accesses are only used when both buffers are aligned for
// them, so the result is the same for any u16-aligned pair of buffers.
void ConvertRGB565ToBGR565(u16 *dst, const u16 *src, const u32 numPixels) {
	// Combined low address bits: a clear bit means it is clear in both pointers.
	const intptr_t addrBits = (intptr_t)src | (intptr_t)dst;
	// Count of leading pixels that have already been converted.
	u32 done = 0;

#ifdef _M_SSE
	// _mm_load_si128 / _mm_store_si128 require 16-byte aligned addresses.
	if ((addrBits & 0xF) == 0) {
		const __m128i maskG = _mm_set1_epi16(0x07E0);
		const __m128i *srcp = (const __m128i *)src;
		__m128i *dstp = (__m128i *)dst;
		// Eight 16-bit pixels per 128-bit register.
		const u32 sseChunks = numPixels / 8;
		for (u32 i = 0; i < sseChunks; ++i) {
			const __m128i c = _mm_load_si128(&srcp[i]);
			__m128i v = _mm_srli_epi16(c, 11);
			v = _mm_or_si128(v, _mm_and_si128(c, maskG));
			v = _mm_or_si128(v, _mm_slli_epi16(c, 11));
			_mm_store_si128(&dstp[i], v);
		}
		done = sseChunks * 8;
	}
#endif

	// Two pixels per 32-bit word, but only when both buffers are 4-byte
	// aligned: dereferencing a misaligned u32 pointer is undefined behavior
	// and traps on strict-alignment CPUs.
	if ((addrBits & 0x3) == 0) {
		const u32 *src32 = (const u32 *)src;
		u32 *dst32 = (u32 *)dst;
		const u32 pairs = numPixels / 2;
		for (u32 i = done / 2; i < pairs; i++) {
			const u32 c = src32[i];
			dst32[i] = ((c >> 11) & 0x001F001F) |
			           ((c >>  0) & 0x07E007E0) |
			           ((c << 11) & 0xF800F800);
		}
		done = pairs * 2;
	}

	// Remaining pixels one at a time: the odd tail pixel, or everything that
	// the wider paths skipped because of alignment.
	for (u32 i = done; i < numPixels; i++) {
		const u16 c = src[i];
		dst[i] = (u16)(((c >> 11) & 0x001F) |
		               ((c >>  0) & 0x07E0) |
		               ((c << 11) & 0xF800));
	}
}

View File

@ -105,6 +105,7 @@ void convert5551_dx9(u16* data, u32* out, int width, int l, int u);
// "Complete" set of color conversion functions between the usual formats.
// Every converter takes a destination buffer, a source buffer and a pixel count.
void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, const u32 numPixels);
// The opposite direction is an alias for the very same function.
#define ConvertRGBA8888ToBGRA8888 ConvertBGRA8888ToRGBA8888
// 32-bit source pixels, 16-bit destination pixels.
void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, const u32 numPixels);
void ConvertRGBA8888ToRGB565(u16 *dst, const u32 *src, const u32 numPixels);
@ -121,3 +122,7 @@ void ConvertRGBA4444ToRGBA8888(u32 *dst, const u16 *src, const u32 numPixels);
// 16-bit source pixels, 32-bit destination pixels.
void ConvertBGRA4444ToRGBA8888(u32 *dst, const u16 *src, const u32 numPixels);
void ConvertBGRA5551ToRGBA8888(u32 *dst, const u16 *src, const u32 numPixels);
void ConvertBGR565ToRGBA8888(u32 *dst, const u16 *src, const u32 numPixels);
// 16-bit to 16-bit conversions: the pixel size is unchanged, only the order
// of the channels inside each pixel is reversed.
void ConvertRGBA4444ToABGR4444(u16 *dst, const u16 *src, const u32 numPixels);
void ConvertRGBA5551ToABGR1555(u16 *dst, const u16 *src, const u32 numPixels);
void ConvertRGB565ToBGR565(u16 *dst, const u16 *src, const u32 numPixels);

View File

@ -787,128 +787,18 @@ static void ConvertColors(void *dstBuf, const void *srcBuf, GLuint dstFmt, int n
u32 *dst = (u32 *)dstBuf;
switch (dstFmt) {
case GL_UNSIGNED_SHORT_4_4_4_4:
{
#ifdef _M_SSE
const __m128i maskB = _mm_set1_epi16(0x00F0);
const __m128i maskG = _mm_set1_epi16(0x0F00);
__m128i *srcp = (__m128i *)src;
__m128i *dstp = (__m128i *)dst;
const int sseChunks = numPixels / 8;
for (int i = 0; i < sseChunks; ++i) {
__m128i c = _mm_load_si128(&srcp[i]);
__m128i v = _mm_srli_epi16(c, 12);
v = _mm_or_si128(v, _mm_and_si128(_mm_srli_epi16(c, 4), maskB));
v = _mm_or_si128(v, _mm_and_si128(_mm_slli_epi16(c, 4), maskG));
v = _mm_or_si128(v, _mm_slli_epi16(c, 12));
_mm_store_si128(&dstp[i], v);
}
// The remainder is done in chunks of 2, SSE was chunks of 8.
int i = sseChunks * 8 / 2;
#else
int i = 0;
// TODO: NEON.
#endif
for (; i < (numPixels + 1) / 2; i++) {
u32 c = src[i];
dst[i] = ((c >> 12) & 0x000F000F) |
((c >> 4) & 0x00F000F0) |
((c << 4) & 0x0F000F00) |
((c << 12) & 0xF000F000);
}
}
ConvertRGBA4444ToABGR4444((u16 *)dst, (const u16 *)src, numPixels);
break;
// Final Fantasy 2 uses this heavily in animated textures.
case GL_UNSIGNED_SHORT_5_5_5_1:
{
#ifdef _M_SSE
const __m128i maskB = _mm_set1_epi16(0x003E);
const __m128i maskG = _mm_set1_epi16(0x07C0);
__m128i *srcp = (__m128i *)src;
__m128i *dstp = (__m128i *)dst;
const int sseChunks = numPixels / 8;
for (int i = 0; i < sseChunks; ++i) {
__m128i c = _mm_load_si128(&srcp[i]);
__m128i v = _mm_srli_epi16(c, 15);
v = _mm_or_si128(v, _mm_and_si128(_mm_srli_epi16(c, 9), maskB));
v = _mm_or_si128(v, _mm_and_si128(_mm_slli_epi16(c, 1), maskG));
v = _mm_or_si128(v, _mm_slli_epi16(c, 11));
_mm_store_si128(&dstp[i], v);
}
// The remainder is done in chunks of 2, SSE was chunks of 8.
int i = sseChunks * 8 / 2;
#else
int i = 0;
// TODO: NEON.
#endif
for (; i < (numPixels + 1) / 2; i++) {
u32 c = src[i];
dst[i] = ((c >> 15) & 0x00010001) |
((c >> 9) & 0x003E003E) |
((c << 1) & 0x07C007C0) |
((c << 11) & 0xF800F800);
}
}
ConvertRGBA5551ToABGR1555((u16 *)dst, (const u16 *)src, numPixels);
break;
case GL_UNSIGNED_SHORT_5_6_5:
{
#ifdef _M_SSE
const __m128i maskG = _mm_set1_epi16(0x07E0);
__m128i *srcp = (__m128i *)src;
__m128i *dstp = (__m128i *)dst;
const int sseChunks = numPixels / 8;
for (int i = 0; i < sseChunks; ++i) {
__m128i c = _mm_load_si128(&srcp[i]);
__m128i v = _mm_srli_epi16(c, 11);
v = _mm_or_si128(v, _mm_and_si128(c, maskG));
v = _mm_or_si128(v, _mm_slli_epi16(c, 11));
_mm_store_si128(&dstp[i], v);
}
// The remainder is done in chunks of 2, SSE was chunks of 8.
int i = sseChunks * 8 / 2;
#else
int i = 0;
// TODO: NEON.
#endif
for (; i < (numPixels + 1) / 2; i++) {
u32 c = src[i];
dst[i] = ((c >> 11) & 0x001F001F) |
((c >> 0) & 0x07E007E0) |
((c << 11) & 0xF800F800);
}
}
ConvertRGB565ToBGR565((u16 *)dst, (const u16 *)src, numPixels);
break;
default:
if (UseBGRA8888()) {
#ifdef _M_SSE
const __m128i maskGA = _mm_set1_epi32(0xFF00FF00);
__m128i *srcp = (__m128i *)src;
__m128i *dstp = (__m128i *)dst;
const int sseChunks = numPixels / 4;
for (int i = 0; i < sseChunks; ++i) {
__m128i c = _mm_load_si128(&srcp[i]);
__m128i rb = _mm_andnot_si128(maskGA, c);
c = _mm_and_si128(c, maskGA);
__m128i b = _mm_srli_epi32(rb, 16);
__m128i r = _mm_slli_epi32(rb, 16);
c = _mm_or_si128(_mm_or_si128(c, r), b);
_mm_store_si128(&dstp[i], c);
}
// The remainder starts right after those done via SSE.
int i = sseChunks * 4;
#else
int i = 0;
#endif
for (; i < numPixels; i++) {
u32 c = src[i];
dst[i] = ((c >> 16) & 0x000000FF) |
((c >> 0) & 0xFF00FF00) |
((c << 16) & 0x00FF0000);
}
ConvertRGBA8888ToBGRA8888(dst, src, numPixels);
} else {
// No need to convert RGBA8888, right order already
if (dst != src)