mirror of
https://github.com/shadps4-emu/ext-SDL.git
synced 2025-01-07 10:50:21 +00:00
Implement mathematically correct scalar blitters
This commit is contained in:
parent
f0238ef615
commit
f5ee88c2cf
@ -546,112 +546,6 @@ static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
|
|
||||||
static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
|
|
||||||
{
|
|
||||||
int width = info->dst_w;
|
|
||||||
int height = info->dst_h;
|
|
||||||
Uint32 *srcp = (Uint32 *)info->src;
|
|
||||||
int srcskip = info->src_skip >> 2;
|
|
||||||
Uint32 *dstp = (Uint32 *)info->dst;
|
|
||||||
int dstskip = info->dst_skip >> 2;
|
|
||||||
|
|
||||||
while (height--) {
|
|
||||||
/* *INDENT-OFF* */ /* clang-format off */
|
|
||||||
DUFFS_LOOP4({
|
|
||||||
Uint32 dalpha;
|
|
||||||
Uint32 d;
|
|
||||||
Uint32 s1;
|
|
||||||
Uint32 d1;
|
|
||||||
Uint32 s = *srcp;
|
|
||||||
Uint32 alpha = s >> 24;
|
|
||||||
/* FIXME: Here we special-case opaque alpha since the
|
|
||||||
compositioning used (>>8 instead of /255) doesn't handle
|
|
||||||
it correctly. Also special-case alpha=0 for speed?
|
|
||||||
Benchmark this! */
|
|
||||||
if (alpha) {
|
|
||||||
if (alpha == SDL_ALPHA_OPAQUE) {
|
|
||||||
*dstp = *srcp;
|
|
||||||
} else {
|
|
||||||
/*
|
|
||||||
* take out the middle component (green), and process
|
|
||||||
* the other two in parallel. One multiply less.
|
|
||||||
*/
|
|
||||||
d = *dstp;
|
|
||||||
dalpha = d >> 24;
|
|
||||||
s1 = s & 0xff00ff;
|
|
||||||
d1 = d & 0xff00ff;
|
|
||||||
d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
|
|
||||||
s &= 0xff00;
|
|
||||||
d &= 0xff00;
|
|
||||||
d = (d + ((s - d) * alpha >> 8)) & 0xff00;
|
|
||||||
dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
|
|
||||||
*dstp = d1 | d | (dalpha << 24);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
++srcp;
|
|
||||||
++dstp;
|
|
||||||
}, width);
|
|
||||||
/* *INDENT-ON* */ /* clang-format on */
|
|
||||||
srcp += srcskip;
|
|
||||||
dstp += dstskip;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* fast ARGB888->(A)BGR888 blending with pixel alpha */
|
|
||||||
static void BlitRGBtoBGRPixelAlpha(SDL_BlitInfo *info)
|
|
||||||
{
|
|
||||||
int width = info->dst_w;
|
|
||||||
int height = info->dst_h;
|
|
||||||
Uint32 *srcp = (Uint32 *)info->src;
|
|
||||||
int srcskip = info->src_skip >> 2;
|
|
||||||
Uint32 *dstp = (Uint32 *)info->dst;
|
|
||||||
int dstskip = info->dst_skip >> 2;
|
|
||||||
|
|
||||||
while (height--) {
|
|
||||||
/* *INDENT-OFF* */ /* clang-format off */
|
|
||||||
DUFFS_LOOP4({
|
|
||||||
Uint32 dalpha;
|
|
||||||
Uint32 d;
|
|
||||||
Uint32 s1;
|
|
||||||
Uint32 d1;
|
|
||||||
Uint32 s = *srcp;
|
|
||||||
Uint32 alpha = s >> 24;
|
|
||||||
/* FIXME: Here we special-case opaque alpha since the
|
|
||||||
compositioning used (>>8 instead of /255) doesn't handle
|
|
||||||
it correctly. Also special-case alpha=0 for speed?
|
|
||||||
Benchmark this! */
|
|
||||||
if (alpha) {
|
|
||||||
/*
|
|
||||||
* take out the middle component (green), and process
|
|
||||||
* the other two in parallel. One multiply less.
|
|
||||||
*/
|
|
||||||
s1 = s & 0xff00ff;
|
|
||||||
s1 = (s1 >> 16) | (s1 << 16);
|
|
||||||
s &= 0xff00;
|
|
||||||
|
|
||||||
if (alpha == SDL_ALPHA_OPAQUE) {
|
|
||||||
*dstp = 0xff000000 | s | s1;
|
|
||||||
} else {
|
|
||||||
d = *dstp;
|
|
||||||
dalpha = d >> 24;
|
|
||||||
d1 = d & 0xff00ff;
|
|
||||||
d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
|
|
||||||
d &= 0xff00;
|
|
||||||
d = (d + ((s - d) * alpha >> 8)) & 0xff00;
|
|
||||||
dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
|
|
||||||
*dstp = d1 | d | (dalpha << 24);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
++srcp;
|
|
||||||
++dstp;
|
|
||||||
}, width);
|
|
||||||
/* *INDENT-ON* */ /* clang-format on */
|
|
||||||
srcp += srcskip;
|
|
||||||
dstp += dstskip;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
|
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
|
||||||
|
|
||||||
/* blend a single 16 bit pixel at 50% */
|
/* blend a single 16 bit pixel at 50% */
|
||||||
@ -1285,6 +1179,15 @@ static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Accurate alpha blending with no division */
|
||||||
|
static Uint8 AlphaBlendChannel(Uint8 sC, Uint8 dC, Uint8 sA)
|
||||||
|
{
|
||||||
|
Uint16 x = ((sC - dC) * sA) + ((dC << 8) - dC);
|
||||||
|
x += 0x1U; // Use 0x80 to round instead of floor
|
||||||
|
x += x >> 8;
|
||||||
|
return x >> 8;
|
||||||
|
}
|
||||||
|
|
||||||
/* General (slow) N->N blending with pixel alpha */
|
/* General (slow) N->N blending with pixel alpha */
|
||||||
static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
|
static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
|
||||||
{
|
{
|
||||||
@ -1298,6 +1201,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
|
|||||||
SDL_PixelFormat *dstfmt = info->dst_fmt;
|
SDL_PixelFormat *dstfmt = info->dst_fmt;
|
||||||
int srcbpp;
|
int srcbpp;
|
||||||
int dstbpp;
|
int dstbpp;
|
||||||
|
int freeFormat;
|
||||||
Uint32 Pixel;
|
Uint32 Pixel;
|
||||||
unsigned sR, sG, sB, sA;
|
unsigned sR, sG, sB, sA;
|
||||||
unsigned dR, dG, dB, dA;
|
unsigned dR, dG, dB, dA;
|
||||||
@ -1305,6 +1209,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
|
|||||||
/* Set up some basic variables */
|
/* Set up some basic variables */
|
||||||
srcbpp = srcfmt->bytes_per_pixel;
|
srcbpp = srcfmt->bytes_per_pixel;
|
||||||
dstbpp = dstfmt->bytes_per_pixel;
|
dstbpp = dstfmt->bytes_per_pixel;
|
||||||
|
freeFormat = 0;
|
||||||
|
|
||||||
#ifdef SDL_AVX2_INTRINSICS
|
#ifdef SDL_AVX2_INTRINSICS
|
||||||
if (srcbpp == 4 && dstbpp == 4 && width >= 4 && SDL_HasAVX2()) {
|
if (srcbpp == 4 && dstbpp == 4 && width >= 4 && SDL_HasAVX2()) {
|
||||||
@ -1319,6 +1224,11 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
/* Handle case where bad input sent */
|
||||||
|
if (dstfmt->Ashift == 0 && dstfmt->Ashift == dstfmt->Bshift) {
|
||||||
|
dstfmt = SDL_CreatePixelFormat(SDL_PIXELFORMAT_ARGB8888);
|
||||||
|
freeFormat = 1;
|
||||||
|
}
|
||||||
|
|
||||||
while (height--) {
|
while (height--) {
|
||||||
/* *INDENT-OFF* */ /* clang-format off */
|
/* *INDENT-OFF* */ /* clang-format off */
|
||||||
@ -1327,7 +1237,10 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
|
|||||||
DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
|
DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
|
||||||
if (sA) {
|
if (sA) {
|
||||||
DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
|
DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
|
||||||
ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
|
dR = AlphaBlendChannel(sR, dR, sA);
|
||||||
|
dG = AlphaBlendChannel(sG, dG, sA);
|
||||||
|
dB = AlphaBlendChannel(sB, dB, sA);
|
||||||
|
dA = AlphaBlendChannel(255, dA, sA);
|
||||||
ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
|
ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
|
||||||
}
|
}
|
||||||
src += srcbpp;
|
src += srcbpp;
|
||||||
@ -1338,6 +1251,9 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
|
|||||||
src += srcskip;
|
src += srcskip;
|
||||||
dst += dstskip;
|
dst += dstskip;
|
||||||
}
|
}
|
||||||
|
if (freeFormat) {
|
||||||
|
SDL_DestroyPixelFormat(dstfmt);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
|
SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
|
||||||
@ -1406,11 +1322,6 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
|
|||||||
return BlitRGBtoRGBPixelAlphaARMSIMD;
|
return BlitRGBtoRGBPixelAlphaARMSIMD;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
return BlitRGBtoRGBPixelAlpha;
|
|
||||||
}
|
|
||||||
} else if (sf->Rmask == df->Bmask && sf->Gmask == df->Gmask && sf->Bmask == df->Rmask && sf->bytes_per_pixel == 4) {
|
|
||||||
if (sf->Amask == 0xff000000) {
|
|
||||||
return BlitRGBtoBGRPixelAlpha;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return BlitNtoNPixelAlpha;
|
return BlitNtoNPixelAlpha;
|
||||||
|
@ -89,9 +89,13 @@ __m256i SDL_TARGETING("avx2") MixRGBA_AVX2(__m256i src, __m256i dst, const __m25
|
|||||||
dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi),
|
dst_hi = _mm256_add_epi16(_mm256_mullo_epi16(_mm256_sub_epi16(src_hi, dst_hi), srca_hi),
|
||||||
_mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi));
|
_mm256_sub_epi16(_mm256_slli_epi16(dst_hi, 8), dst_hi));
|
||||||
|
|
||||||
// dst = (dst * 0x8081) >> 23
|
// dst += 0x1U (use 0x80 to round instead of floor)
|
||||||
dst_lo = _mm256_srli_epi16(_mm256_mulhi_epu16(dst_lo, _mm256_set1_epi16(-0x7F7F)), 7);
|
dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1));
|
||||||
dst_hi = _mm256_srli_epi16(_mm256_mulhi_epu16(dst_hi, _mm256_set1_epi16(-0x7F7F)), 7);
|
dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1));
|
||||||
|
|
||||||
|
// dst += dst >> 8
|
||||||
|
dst_lo = _mm256_srli_epi16(_mm256_add_epi16(dst_lo, _mm256_srli_epi16(dst_lo, 8)), 8);
|
||||||
|
dst_hi = _mm256_srli_epi16(_mm256_add_epi16(dst_hi, _mm256_srli_epi16(dst_hi, 8)), 8);
|
||||||
|
|
||||||
dst = _mm256_packus_epi16(dst_lo, dst_hi);
|
dst = _mm256_packus_epi16(dst_lo, dst_hi);
|
||||||
return dst;
|
return dst;
|
||||||
|
@ -90,9 +90,13 @@ __m128i SDL_TARGETING("sse4.1") MixRGBA_SSE4_1(__m128i src, __m128i dst,
|
|||||||
dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi),
|
dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srca_hi),
|
||||||
_mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));
|
_mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));
|
||||||
|
|
||||||
// dst = (dst * 0x8081) >> 23
|
// dst += 0x1U (use 0x80 to round instead of floor)
|
||||||
dst_lo = _mm_srli_epi16(_mm_mulhi_epu16(dst_lo, _mm_set1_epi16(-0x7F7F)), 7);
|
dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
|
||||||
dst_hi = _mm_srli_epi16(_mm_mulhi_epu16(dst_hi, _mm_set1_epi16(-0x7F7F)), 7);
|
dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));
|
||||||
|
|
||||||
|
// dst += dst >> 8;
|
||||||
|
dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
|
||||||
|
dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);
|
||||||
|
|
||||||
dst = _mm_packus_epi16(dst_lo, dst_hi);
|
dst = _mm_packus_epi16(dst_lo, dst_hi);
|
||||||
return dst;
|
return dst;
|
||||||
|
@ -98,8 +98,7 @@ Uint32 hashSurfacePixels(SDL_Surface * surface) {
|
|||||||
int blit_testExampleApplicationRender(void *arg) {
|
int blit_testExampleApplicationRender(void *arg) {
|
||||||
const int width = 32;
|
const int width = 32;
|
||||||
const int height = 32;
|
const int height = 32;
|
||||||
const unsigned long scalar_hash = 0xf47a3f55;
|
const unsigned long correct_hash = 0xe345d7a7;
|
||||||
const unsigned long x86_simd_hash = 0xe345d7a7;
|
|
||||||
SDL_Surface* dest_surface = SDL_CreateSurface(width, height, SDL_PIXELFORMAT_ARGB8888);
|
SDL_Surface* dest_surface = SDL_CreateSurface(width, height, SDL_PIXELFORMAT_ARGB8888);
|
||||||
SDL_Surface* rainbow_background = SDLTest_ImageBlendingBackground();
|
SDL_Surface* rainbow_background = SDLTest_ImageBlendingBackground();
|
||||||
SDL_Surface* gearbrain_sprite = SDLTest_ImageBlendingSprite();
|
SDL_Surface* gearbrain_sprite = SDLTest_ImageBlendingSprite();
|
||||||
@ -109,9 +108,8 @@ int blit_testExampleApplicationRender(void *arg) {
|
|||||||
SDL_BlitSurface(gearbrain_sprite, NULL, dest_surface, NULL);
|
SDL_BlitSurface(gearbrain_sprite, NULL, dest_surface, NULL);
|
||||||
// Check result
|
// Check result
|
||||||
const unsigned long hash = hashSurfacePixels(dest_surface);
|
const unsigned long hash = hashSurfacePixels(dest_surface);
|
||||||
SDLTest_AssertCheck(hash == scalar_hash || hash == x86_simd_hash,
|
SDLTest_AssertCheck(hash == correct_hash,
|
||||||
"Should render identically, expected 0x%lx (scalar) or 0x%lx (x86_simd), got 0x%lx",
|
"Should render identically, expected hash 0x%lx, got 0x%lx", correct_hash, hash);
|
||||||
scalar_hash, x86_simd_hash, hash);
|
|
||||||
// Clean up
|
// Clean up
|
||||||
SDL_DestroySurface(rainbow_background);
|
SDL_DestroySurface(rainbow_background);
|
||||||
SDL_DestroySurface(gearbrain_sprite);
|
SDL_DestroySurface(gearbrain_sprite);
|
||||||
@ -126,8 +124,7 @@ int blit_testExampleApplicationRender(void *arg) {
|
|||||||
int blit_testRandomToRandomSVGA(void *arg) {
|
int blit_testRandomToRandomSVGA(void *arg) {
|
||||||
const int width = 800;
|
const int width = 800;
|
||||||
const int height = 600;
|
const int height = 600;
|
||||||
const unsigned long scalar_hash = 0x1f56efad;
|
const unsigned long correct_hash = 0x42140c5f;
|
||||||
const unsigned long x86_simd_hash = 0x42140c5f;
|
|
||||||
// Allocate random buffers
|
// Allocate random buffers
|
||||||
Uint32 *dest_pixels = getNextRandomBuffer(width, height);
|
Uint32 *dest_pixels = getNextRandomBuffer(width, height);
|
||||||
Uint32 *src_pixels = getNextRandomBuffer(width, height);
|
Uint32 *src_pixels = getNextRandomBuffer(width, height);
|
||||||
@ -138,9 +135,8 @@ int blit_testRandomToRandomSVGA(void *arg) {
|
|||||||
SDL_BlitSurface(src_surface, NULL, dest_surface, NULL);
|
SDL_BlitSurface(src_surface, NULL, dest_surface, NULL);
|
||||||
// Check result
|
// Check result
|
||||||
const unsigned long hash = hashSurfacePixels(dest_surface);
|
const unsigned long hash = hashSurfacePixels(dest_surface);
|
||||||
SDLTest_AssertCheck(hash == scalar_hash || hash == x86_simd_hash,
|
SDLTest_AssertCheck(hash == correct_hash,
|
||||||
"Should render identically, expected 0x%lx (scalar) or 0x%lx (x86_simd), got 0x%lx",
|
"Should render identically, expected hash 0x%lx, got 0x%lx", correct_hash, hash);
|
||||||
scalar_hash, x86_simd_hash, hash);
|
|
||||||
// Clean up
|
// Clean up
|
||||||
SDL_DestroySurface(dest_surface);
|
SDL_DestroySurface(dest_surface);
|
||||||
SDL_DestroySurface(src_surface);
|
SDL_DestroySurface(src_surface);
|
||||||
@ -157,8 +153,7 @@ int blit_testRandomToRandomSVGAMultipleIterations(void *arg) {
|
|||||||
const int width = 800;
|
const int width = 800;
|
||||||
const int height = 600;
|
const int height = 600;
|
||||||
int i;
|
int i;
|
||||||
const unsigned long x86_simd_hash = 0x2626be78;
|
const unsigned long correct_hash = 0x5d26be78;
|
||||||
const unsigned long scalar_hash = 0xfb2a8ee8;
|
|
||||||
// Create blank source surface
|
// Create blank source surface
|
||||||
SDL_Surface* dest_surface = SDL_CreateSurface(width, height, SDL_PIXELFORMAT_ABGR8888);
|
SDL_Surface* dest_surface = SDL_CreateSurface(width, height, SDL_PIXELFORMAT_ABGR8888);
|
||||||
|
|
||||||
@ -180,11 +175,9 @@ int blit_testRandomToRandomSVGAMultipleIterations(void *arg) {
|
|||||||
// Check result
|
// Check result
|
||||||
const unsigned long hash = hashSurfacePixels(dest_surface);
|
const unsigned long hash = hashSurfacePixels(dest_surface);
|
||||||
// Clean up
|
// Clean up
|
||||||
SDL_SaveBMP(dest_surface, "250k_scalar.bmp");
|
|
||||||
SDL_DestroySurface(dest_surface);
|
SDL_DestroySurface(dest_surface);
|
||||||
SDLTest_AssertCheck(hash == scalar_hash || hash == x86_simd_hash,
|
SDLTest_AssertCheck(hash == correct_hash,
|
||||||
"Should render identically, expected 0x%lx (scalar) or 0x%lx (x86_simd), got 0x%lx",
|
"Should render identically, expected hash 0x%lx, got 0x%lx", correct_hash, hash);
|
||||||
scalar_hash, x86_simd_hash, hash);
|
|
||||||
return TEST_COMPLETED;
|
return TEST_COMPLETED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user