mirror of
https://github.com/libretro/scummvm.git
synced 2025-01-09 03:10:22 +00:00
GRAPHICS: Optimize alpha blend NEON and Generic
I optimized the NEON and Generic paths for ManagedSurface::blendBlitFrom and the new TransparentSurface::blit. Now (on ARM), the new blit function matches the speed of the old blit function, even with the added indirections that the runtime extension detection code introduces. In addition, I wrote a benchmark for this code; you can build and run it with this command: CFLAGS="-DTEST_BLEND_SPEED" make test. I reverted the Wii backend so that it no longer reports AltiVec support, since it doesn't support it. I also removed graphics/blit-neon.cpp from graphics/module.mk: simply including the .cpp file in graphics/blit-alpha.cpp was the better option, because then I didn't need to explicitly instantiate every version of the templates that I needed.
This commit is contained in:
parent
402c67064d
commit
eebadf4495
@ -176,8 +176,7 @@ bool OSystem_Wii::hasFeature(Feature f) {
|
||||
return (f == kFeatureFullscreenMode) ||
|
||||
(f == kFeatureAspectRatioCorrection) ||
|
||||
(f == kFeatureCursorPalette) ||
|
||||
(f == kFeatureOverlaySupportsAlpha) ||
|
||||
(f == kFeatureAltivec);
|
||||
(f == kFeatureOverlaySupportsAlpha));
|
||||
}
|
||||
|
||||
void OSystem_Wii::setFeatureState(Feature f, bool enable) {
|
||||
|
@ -23,6 +23,8 @@
|
||||
#include "graphics/pixelformat.h"
|
||||
#include "common/system.h"
|
||||
|
||||
#include "graphics/blit-neon.cpp"
|
||||
|
||||
namespace Graphics {
|
||||
|
||||
namespace {
|
||||
@ -203,7 +205,7 @@ BlendBlit::Args::Args(byte *dst, const byte *src,
|
||||
/**
|
||||
* Optimized version of doBlit to be used with multiply blended blitting
|
||||
*/
|
||||
template<bool doscale>
|
||||
template<bool doscale, bool rgbmod, bool alphamod>
|
||||
void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
|
||||
const byte *in;
|
||||
byte *out;
|
||||
@ -211,10 +213,10 @@ void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
|
||||
int scaleXCtr, scaleYCtr = 0;
|
||||
const byte *inBase;
|
||||
|
||||
byte ca = args.alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
|
||||
byte cr = args.rgbmod ? ((args.color >> kRModShift) & 0xFF) : 255;
|
||||
byte cg = args.rgbmod ? ((args.color >> kGModShift) & 0xFF) : 255;
|
||||
byte cb = args.rgbmod ? ((args.color >> kBModShift) & 0xFF) : 255;
|
||||
const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
|
||||
const byte cr = rgbmod ? ((args.color >> kRModShift) & 0xFF) : 255;
|
||||
const byte cg = rgbmod ? ((args.color >> kGModShift) & 0xFF) : 255;
|
||||
const byte cb = rgbmod ? ((args.color >> kBModShift) & 0xFF) : 255;
|
||||
|
||||
for (uint32 i = 0; i < args.height; i++) {
|
||||
if (doscale) {
|
||||
@ -266,7 +268,7 @@ void BlendBlit::doBlitMultiplyBlendLogicGeneric(Args &args) {
|
||||
|
||||
}
|
||||
|
||||
template<bool doscale>
|
||||
template<bool doscale, bool rgbmod, bool alphamod>
|
||||
void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
|
||||
const byte *in;
|
||||
byte *out;
|
||||
@ -274,10 +276,10 @@ void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
|
||||
int scaleXCtr, scaleYCtr = 0;
|
||||
const byte *inBase;
|
||||
|
||||
byte ca = args.alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
|
||||
byte cr = args.rgbmod ? ((args.color >> kRModShift) & 0xFF) : 255;
|
||||
byte cg = args.rgbmod ? ((args.color >> kGModShift) & 0xFF) : 255;
|
||||
byte cb = args.rgbmod ? ((args.color >> kBModShift) & 0xFF) : 255;
|
||||
const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
|
||||
const byte cr = rgbmod ? ((args.color >> kRModShift) & 0xFF) : 255;
|
||||
const byte cg = rgbmod ? ((args.color >> kGModShift) & 0xFF) : 255;
|
||||
const byte cb = rgbmod ? ((args.color >> kBModShift) & 0xFF) : 255;
|
||||
|
||||
for (uint32 i = 0; i < args.height; i++) {
|
||||
if (doscale) {
|
||||
@ -294,15 +296,37 @@ void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
|
||||
|
||||
uint32 ina = in[kAIndex] * ca >> 8;
|
||||
|
||||
if (ina != 0) {
|
||||
uint outb = (out[kBIndex] * (255 - ina) >> 8);
|
||||
uint outg = (out[kGIndex] * (255 - ina) >> 8);
|
||||
uint outr = (out[kRIndex] * (255 - ina) >> 8);
|
||||
if (rgbmod) {
|
||||
if (ina != 0) {
|
||||
const uint outb = (out[kBIndex] * (255 - ina) >> 8);
|
||||
const uint outg = (out[kGIndex] * (255 - ina) >> 8);
|
||||
const uint outr = (out[kRIndex] * (255 - ina) >> 8);
|
||||
|
||||
out[kAIndex] = 255;
|
||||
out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
|
||||
out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
|
||||
out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
|
||||
out[kAIndex] = 255;
|
||||
out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
|
||||
out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
|
||||
out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
|
||||
}
|
||||
} else {
|
||||
if (ina != 0) {
|
||||
// Runs faster on newer hardware (doesn't do single byte manip)
|
||||
const uint32 in32 = *(const uint32 *)in;
|
||||
const uint32 out32 = *(const uint32 *)out;
|
||||
const uint32 rb = (in32 & (kRModMask | kBModMask)) >> 8;
|
||||
const uint32 g = in32 & kGModMask;
|
||||
const uint32 dstrb = (out32 & (kRModMask | kBModMask)) >> 8;
|
||||
const uint32 dstg = out32 & kGModMask;
|
||||
*(uint32 *)out = kAModMask |
|
||||
((dstrb * (255 - ina) + rb * ina) & (kRModMask | kBModMask)) |
|
||||
((dstg * (255 - ina) + g * ina) >> 8);
|
||||
|
||||
// I think this code will run faster on older hardware
|
||||
// TODO maybe?: Put #ifdef to use on older hardware
|
||||
//out[kAIndex] = 255;
|
||||
//out[kBIndex] = (out[kBIndex] * (255 - ina) + in[kBIndex] * ina) >> 8;
|
||||
//out[kGIndex] = (out[kGIndex] * (255 - ina) + in[kGIndex] * ina) >> 8;
|
||||
//out[kRIndex] = (out[kRIndex] * (255 - ina) + in[kRIndex] * ina) >> 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (doscale)
|
||||
@ -323,7 +347,7 @@ void BlendBlit::doBlitAlphaBlendLogicGeneric(Args &args) {
|
||||
/**
|
||||
* Optimized version of doBlit to be used with subtractive blended blitting
|
||||
*/
|
||||
template<bool doscale>
|
||||
template<bool doscale, bool rgbmod>
|
||||
void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
|
||||
const byte *in;
|
||||
byte *out;
|
||||
@ -331,9 +355,9 @@ void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
|
||||
int scaleXCtr, scaleYCtr = 0;
|
||||
const byte *inBase;
|
||||
|
||||
byte cr = args.rgbmod ? ((args.color >> kRModShift) & 0xFF) : 255;
|
||||
byte cg = args.rgbmod ? ((args.color >> kGModShift) & 0xFF) : 255;
|
||||
byte cb = args.rgbmod ? ((args.color >> kBModShift) & 0xFF) : 255;
|
||||
const byte cr = rgbmod ? ((args.color >> kRModShift) & 0xFF) : 255;
|
||||
const byte cg = rgbmod ? ((args.color >> kGModShift) & 0xFF) : 255;
|
||||
const byte cb = rgbmod ? ((args.color >> kBModShift) & 0xFF) : 255;
|
||||
|
||||
for (uint32 i = 0; i < args.height; i++) {
|
||||
if (doscale) {
|
||||
@ -384,7 +408,7 @@ void BlendBlit::doBlitSubtractiveBlendLogicGeneric(Args &args) {
|
||||
/**
|
||||
* Optimized version of doBlit to be used with additive blended blitting
|
||||
*/
|
||||
template<bool doscale>
|
||||
template<bool doscale, bool rgbmod, bool alphamod>
|
||||
void BlendBlit::doBlitAdditiveBlendLogicGeneric(Args &args) {
|
||||
const byte *in;
|
||||
byte *out;
|
||||
@ -392,10 +416,10 @@ void BlendBlit::doBlitAdditiveBlendLogicGeneric(Args &args) {
|
||||
int scaleXCtr, scaleYCtr = 0;
|
||||
const byte *inBase;
|
||||
|
||||
byte ca = args.alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
|
||||
byte cr = args.rgbmod ? ((args.color >> kRModShift) & 0xFF) : 255;
|
||||
byte cg = args.rgbmod ? ((args.color >> kGModShift) & 0xFF) : 255;
|
||||
byte cb = args.rgbmod ? ((args.color >> kBModShift) & 0xFF) : 255;
|
||||
const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
|
||||
const byte cr = rgbmod ? ((args.color >> kRModShift) & 0xFF) : 255;
|
||||
const byte cg = rgbmod ? ((args.color >> kGModShift) & 0xFF) : 255;
|
||||
const byte cb = rgbmod ? ((args.color >> kBModShift) & 0xFF) : 255;
|
||||
|
||||
for (uint32 i = 0; i < args.height; i++) {
|
||||
if (doscale) {
|
||||
@ -552,12 +576,12 @@ void BlendBlit::blit(byte *dst, const byte *src,
|
||||
const AlphaType alphaType) {
|
||||
if (width == 0 || height == 0) return;
|
||||
if (!blitFunc) {
|
||||
// Get the correct blit function
|
||||
// Get the correct blit function
|
||||
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
if (g_system->hasFeature(OSystem::kFeatureNEON)) blitFunc = blitNEON;
|
||||
else blitFunc = blitGeneric;
|
||||
if (g_system->hasFeature(OSystem::kFeatureNEON)) blitFunc = blitNEON;
|
||||
else blitFunc = blitGeneric;
|
||||
#else
|
||||
blitFunc = blitGeneric;
|
||||
blitFunc = blitGeneric;
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -565,8 +589,11 @@ void BlendBlit::blit(byte *dst, const byte *src,
|
||||
blitFunc(args, blendMode, alphaType);
|
||||
}
|
||||
|
||||
// Let me know if there is a way to do function pointer to templated functions
|
||||
#define BLIT_FUNC(ext) \
|
||||
void BlendBlit::blit##ext(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) { \
|
||||
bool rgbmod = ((args.color & kRGBModMask) != kRGBModMask); \
|
||||
bool alphamod = ((args.color & kAModMask) != kAModMask); \
|
||||
if (args.scaleX == SCALE_THRESHOLD && args.scaleY == SCALE_THRESHOLD) { \
|
||||
if (args.color == 0xffffffff && blendMode == BLEND_NORMAL && alphaType == ALPHA_OPAQUE) { \
|
||||
doBlitOpaqueBlendLogic##ext<false>(args); \
|
||||
@ -574,14 +601,54 @@ void BlendBlit::blit(byte *dst, const byte *src,
|
||||
doBlitBinaryBlendLogic##ext<false>(args); \
|
||||
} else { \
|
||||
if (blendMode == BLEND_ADDITIVE) { \
|
||||
doBlitAdditiveBlendLogic##ext<false>(args); \
|
||||
if (rgbmod) { \
|
||||
if (alphamod) { \
|
||||
doBlitAdditiveBlendLogic##ext<false, true, true>(args); \
|
||||
} else { \
|
||||
doBlitAdditiveBlendLogic##ext<false, true, false>(args); \
|
||||
} \
|
||||
} else { \
|
||||
if (alphamod) { \
|
||||
doBlitAdditiveBlendLogic##ext<false, false, true>(args); \
|
||||
} else { \
|
||||
doBlitAdditiveBlendLogic##ext<false, false, false>(args); \
|
||||
} \
|
||||
} \
|
||||
} else if (blendMode == BLEND_SUBTRACTIVE) { \
|
||||
doBlitSubtractiveBlendLogic##ext<false>(args); \
|
||||
if (rgbmod) { \
|
||||
doBlitSubtractiveBlendLogic##ext<false, true>(args); \
|
||||
} else { \
|
||||
doBlitSubtractiveBlendLogic##ext<false, false>(args); \
|
||||
} \
|
||||
} else if (blendMode == BLEND_MULTIPLY) { \
|
||||
doBlitMultiplyBlendLogic##ext<false>(args); \
|
||||
if (rgbmod) { \
|
||||
if (alphamod) { \
|
||||
doBlitMultiplyBlendLogic##ext<false, true, true>(args); \
|
||||
} else { \
|
||||
doBlitMultiplyBlendLogic##ext<false, true, false>(args); \
|
||||
} \
|
||||
} else { \
|
||||
if (alphamod) { \
|
||||
doBlitMultiplyBlendLogic##ext<false, false, true>(args); \
|
||||
} else { \
|
||||
doBlitMultiplyBlendLogic##ext<false, false, false>(args); \
|
||||
} \
|
||||
} \
|
||||
} else { \
|
||||
assert(blendMode == BLEND_NORMAL); \
|
||||
doBlitAlphaBlendLogic##ext<false>(args); \
|
||||
if (rgbmod) { \
|
||||
if (alphamod) { \
|
||||
doBlitAlphaBlendLogic##ext<false, true, true>(args); \
|
||||
} else { \
|
||||
doBlitAlphaBlendLogic##ext<false, true, false>(args); \
|
||||
} \
|
||||
} else { \
|
||||
if (alphamod) { \
|
||||
doBlitAlphaBlendLogic##ext<false, false, true>(args); \
|
||||
} else { \
|
||||
doBlitAlphaBlendLogic##ext<false, false, false>(args); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} else { \
|
||||
@ -591,14 +658,54 @@ void BlendBlit::blit(byte *dst, const byte *src,
|
||||
doBlitBinaryBlendLogic##ext<true>(args); \
|
||||
} else { \
|
||||
if (blendMode == BLEND_ADDITIVE) { \
|
||||
doBlitAdditiveBlendLogic##ext<true>(args); \
|
||||
if (rgbmod) { \
|
||||
if (alphamod) { \
|
||||
doBlitAdditiveBlendLogic##ext<true, true, true>(args); \
|
||||
} else { \
|
||||
doBlitAdditiveBlendLogic##ext<true, true, false>(args); \
|
||||
} \
|
||||
} else { \
|
||||
if (alphamod) { \
|
||||
doBlitAdditiveBlendLogic##ext<true, false, true>(args); \
|
||||
} else { \
|
||||
doBlitAdditiveBlendLogic##ext<true, false, false>(args); \
|
||||
} \
|
||||
} \
|
||||
} else if (blendMode == BLEND_SUBTRACTIVE) { \
|
||||
doBlitSubtractiveBlendLogic##ext<true>(args); \
|
||||
if (rgbmod) { \
|
||||
doBlitSubtractiveBlendLogic##ext<true, true>(args); \
|
||||
} else { \
|
||||
doBlitSubtractiveBlendLogic##ext<true, false>(args); \
|
||||
} \
|
||||
} else if (blendMode == BLEND_MULTIPLY) { \
|
||||
doBlitMultiplyBlendLogic##ext<true>(args); \
|
||||
if (rgbmod) { \
|
||||
if (alphamod) { \
|
||||
doBlitMultiplyBlendLogic##ext<true, true, true>(args); \
|
||||
} else { \
|
||||
doBlitMultiplyBlendLogic##ext<true, true, false>(args); \
|
||||
} \
|
||||
} else { \
|
||||
if (alphamod) { \
|
||||
doBlitMultiplyBlendLogic##ext<true, false, true>(args); \
|
||||
} else { \
|
||||
doBlitMultiplyBlendLogic##ext<true, false, false>(args); \
|
||||
} \
|
||||
} \
|
||||
} else { \
|
||||
assert(blendMode == BLEND_NORMAL); \
|
||||
doBlitAlphaBlendLogic##ext<true>(args); \
|
||||
if (rgbmod) { \
|
||||
if (alphamod) { \
|
||||
doBlitAlphaBlendLogic##ext<true, true, true>(args); \
|
||||
} else { \
|
||||
doBlitAlphaBlendLogic##ext<true, true, false>(args); \
|
||||
} \
|
||||
} else { \
|
||||
if (alphamod) { \
|
||||
doBlitAlphaBlendLogic##ext<true, false, true>(args); \
|
||||
} else { \
|
||||
doBlitAlphaBlendLogic##ext<true, false, false>(args); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
|
@ -19,12 +19,14 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "graphics/blit.h"
|
||||
#include "graphics/pixelformat.h"
|
||||
|
||||
namespace Graphics {
|
||||
|
||||
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
template<bool doscale>
|
||||
void BlendBlit::doBlitBinaryBlendLogicNEON(Args &args) {
|
||||
(void)args;
|
||||
@ -33,33 +35,143 @@ template<bool doscale>
|
||||
void BlendBlit::doBlitOpaqueBlendLogicNEON(Args &args) {
|
||||
(void)args;
|
||||
}
|
||||
template<bool doscale>
|
||||
template<bool doscale, bool rgbmod, bool alphamod>
|
||||
void BlendBlit::doBlitMultiplyBlendLogicNEON(Args &args) {
|
||||
(void)args;
|
||||
}
|
||||
template<bool doscale>
|
||||
template<bool doscale, bool rgbmod>
|
||||
void BlendBlit::doBlitSubtractiveBlendLogicNEON(Args &args) {
|
||||
(void)args;
|
||||
}
|
||||
template<bool doscale>
|
||||
template<bool doscale, bool rgbmod, bool alphamod>
|
||||
void BlendBlit::doBlitAdditiveBlendLogicNEON(Args &args) {
|
||||
(void)args;
|
||||
}
|
||||
template<bool doscale>
|
||||
|
||||
template<bool rgbmod, bool alphamod>
|
||||
static inline uint32x4_t drawPixelAlphaBlend(uint32x4_t src, uint32x4_t dst, const bool flip, const byte ca, const byte cr, const byte cg, const byte cb) {
|
||||
if (flip) {
|
||||
src = vrev64q_u32(src);
|
||||
src = vcombine_u32(vget_high_u32(src), vget_low_u32(src));
|
||||
}
|
||||
uint32x4_t ina;
|
||||
if (alphamod)
|
||||
ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(ca)), 8);
|
||||
else
|
||||
ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
|
||||
uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
|
||||
|
||||
if (rgbmod) {
|
||||
uint32x4_t dstR = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), 16);
|
||||
uint32x4_t srcR = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), 16);
|
||||
uint32x4_t dstG = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), 8);
|
||||
uint32x4_t srcG = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), 8);
|
||||
uint32x4_t dstB = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
|
||||
uint32x4_t srcB = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
|
||||
|
||||
dstR = vshrq_n_u32(vmulq_u32(dstR, vsubq_u32(vmovq_n_u32(255), ina)), 8);
|
||||
dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
|
||||
dstB = vshrq_n_u32(vmulq_u32(dstB, vsubq_u32(vmovq_n_u32(255), ina)), 8);
|
||||
srcR = vaddq_u32(dstR, vshrq_n_u32(vmulq_u32(vmulq_u32(srcR, ina), vmovq_n_u32(cr)), 16));
|
||||
srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(vmulq_u32(srcG, ina), vmovq_n_u32(cg)), 16));
|
||||
srcB = vaddq_u32(dstB, vshrq_n_u32(vmulq_u32(vmulq_u32(srcB, ina), vmovq_n_u32(cb)), 16));
|
||||
src = vorrq_u32(vandq_u32(srcB, vmovq_n_u32(BlendBlit::kBModMask)), vmovq_n_u32(BlendBlit::kAModMask));
|
||||
src = vorrq_u32(vandq_u32(vshlq_n_u32(srcG, 8), vmovq_n_u32(BlendBlit::kGModMask)), src);
|
||||
src = vorrq_u32(vandq_u32(vshlq_n_u32(srcR, 16), vmovq_n_u32(BlendBlit::kRModMask)), src);
|
||||
} else {
|
||||
uint32x4_t dstRB = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
|
||||
uint32x4_t srcRB = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
|
||||
uint32x4_t dstG = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
|
||||
uint32x4_t srcG = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
|
||||
|
||||
dstRB = vshrq_n_u32(vmulq_u32(dstRB, vsubq_u32(vmovq_n_u32(255), ina)), 8);
|
||||
dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
|
||||
srcRB = vaddq_u32(dstRB, vshrq_n_u32(vmulq_u32(srcRB, ina), 8));
|
||||
srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(srcG, ina), 8));
|
||||
src = vorrq_u32(vandq_u32(srcG, vmovq_n_u32(BlendBlit::kGModMask)), vmovq_n_u32(BlendBlit::kAModMask));
|
||||
src = vorrq_u32(vandq_u32(vshlq_n_u32(srcRB, 8), vmovq_n_u32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
|
||||
}
|
||||
|
||||
dst = vandq_u32(alphaMask, dst);
|
||||
src = vandq_u32(vmvnq_u32(alphaMask), src);
|
||||
return vorrq_u32(dst, src);
|
||||
}
|
||||
template<bool doscale, bool rgbmod, bool alphamod>
|
||||
void BlendBlit::doBlitAlphaBlendLogicNEON(Args &args) {
|
||||
(void)args;
|
||||
const byte *in;
|
||||
byte *out;
|
||||
|
||||
int scaleXCtr, scaleYCtr = 0;
|
||||
const byte *inBase;
|
||||
|
||||
const byte ca = alphamod ? ((args.color >> kAModShift) & 0xFF) : 255;
|
||||
const byte cr = rgbmod ? ((args.color >> kRModShift) & 0xFF) : 255;
|
||||
const byte cg = rgbmod ? ((args.color >> kGModShift) & 0xFF) : 255;
|
||||
const byte cb = rgbmod ? ((args.color >> kBModShift) & 0xFF) : 255;
|
||||
|
||||
for (uint32 i = 0; i < args.height; i++) {
|
||||
if (doscale) {
|
||||
inBase = args.ino + scaleYCtr / SCALE_THRESHOLD * args.inoStep;
|
||||
scaleXCtr = 0;
|
||||
} else {
|
||||
in = args.ino;
|
||||
}
|
||||
out = args.outo;
|
||||
|
||||
uint32 j;
|
||||
for (j = 0; j + 4 < args.width; j += 4) {
|
||||
uint32x4_t dstPixels = vld1q_u32((const uint32 *)out);
|
||||
uint32x4_t srcPixels;
|
||||
if (!doscale) {
|
||||
srcPixels = vld1q_u32((const uint32 *)in);
|
||||
} else {
|
||||
srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep), vmovq_n_u32(0), 0);
|
||||
scaleXCtr += args.scaleX;
|
||||
srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep), srcPixels, 0);
|
||||
scaleXCtr += args.scaleX;
|
||||
srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep), srcPixels, 0);
|
||||
scaleXCtr += args.scaleX;
|
||||
srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep), srcPixels, 0);
|
||||
scaleXCtr += args.scaleX;
|
||||
}
|
||||
uint32x4_t res = drawPixelAlphaBlend<rgbmod, alphamod>(srcPixels, dstPixels, args.flipping & FLIP_H, ca, cr, cg, cb);
|
||||
vst1q_u32((uint32 *)out, res);
|
||||
if (!doscale)
|
||||
in += args.inStep * 4;
|
||||
out += 4 * 4;
|
||||
}
|
||||
for (; j < args.width; j++) {
|
||||
if (doscale) {
|
||||
in = inBase + scaleXCtr / SCALE_THRESHOLD * args.inStep;
|
||||
}
|
||||
uint32 ina = in[kAIndex] * ca >> 8;
|
||||
|
||||
if (ina != 0) {
|
||||
uint outb = (out[kBIndex] * (255 - ina) >> 8);
|
||||
uint outg = (out[kGIndex] * (255 - ina) >> 8);
|
||||
uint outr = (out[kRIndex] * (255 - ina) >> 8);
|
||||
|
||||
out[kAIndex] = 255;
|
||||
out[kBIndex] = outb + (in[kBIndex] * ina * cb >> 16);
|
||||
out[kGIndex] = outg + (in[kGIndex] * ina * cg >> 16);
|
||||
out[kRIndex] = outr + (in[kRIndex] * ina * cr >> 16);
|
||||
}
|
||||
|
||||
if (doscale)
|
||||
scaleXCtr += args.scaleX;
|
||||
else
|
||||
in += args.inStep;
|
||||
out += 4;
|
||||
}
|
||||
|
||||
if (doscale)
|
||||
scaleYCtr += args.scaleY;
|
||||
else
|
||||
args.ino += args.inoStep;
|
||||
args.outo += args.dstPitch;
|
||||
}
|
||||
}
|
||||
|
||||
#define INSTANTIATE_BLIT_TEMPLATES(ext, b) \
|
||||
template void BlendBlit::doBlitBinaryBlendLogic##ext<b>(Args &); \
|
||||
template void BlendBlit::doBlitOpaqueBlendLogic##ext<b>(Args &); \
|
||||
template void BlendBlit::doBlitMultiplyBlendLogic##ext<b>(Args &); \
|
||||
template void BlendBlit::doBlitSubtractiveBlendLogic##ext<b>(Args &); \
|
||||
template void BlendBlit::doBlitAdditiveBlendLogic##ext<b>(Args &); \
|
||||
template void BlendBlit::doBlitAlphaBlendLogic##ext<b>(Args &);
|
||||
INSTANTIATE_BLIT_TEMPLATES(NEON, true)
|
||||
INSTANTIATE_BLIT_TEMPLATES(NEON, false)
|
||||
#undef INSTANTIATE_BLIT_TEMPLATES
|
||||
} // end of namespace Graphics
|
||||
|
||||
#endif // __ARM_NEON__
|
||||
|
||||
}
|
||||
|
@ -29,6 +29,8 @@ namespace Common {
|
||||
struct Point;
|
||||
}
|
||||
|
||||
class BlendBlitUnfilteredTestSuite;
|
||||
|
||||
namespace Graphics {
|
||||
|
||||
/**
|
||||
@ -194,29 +196,6 @@ bool setAlpha(byte *dst, const byte *src,
|
||||
// This is a class so that we can declare certain things as private
|
||||
class BlendBlit {
|
||||
private:
|
||||
static const int kBModShift = 8;
|
||||
static const int kGModShift = 16;
|
||||
static const int kRModShift = 24;
|
||||
static const int kAModShift = 0;
|
||||
|
||||
static const uint32 kBModMask = 0x0000ff00;
|
||||
static const uint32 kGModMask = 0x00ff0000;
|
||||
static const uint32 kRModMask = 0xff000000;
|
||||
static const uint32 kAModMask = 0x000000ff;
|
||||
static const uint32 kRGBModMask = (kRModMask | kGModMask | kBModMask);
|
||||
|
||||
#ifdef SCUMM_LITTLE_ENDIAN
|
||||
static const int kAIndex = 0;
|
||||
static const int kBIndex = 1;
|
||||
static const int kGIndex = 2;
|
||||
static const int kRIndex = 3;
|
||||
#else
|
||||
static const int kAIndex = 3;
|
||||
static const int kBIndex = 2;
|
||||
static const int kGIndex = 1;
|
||||
static const int kRIndex = 0;
|
||||
#endif
|
||||
|
||||
struct Args {
|
||||
bool rgbmod, alphamod;
|
||||
int xp, yp;
|
||||
@ -246,33 +225,56 @@ private:
|
||||
static void doBlitBinaryBlendLogic##ext(Args &args); \
|
||||
template<bool doscale> \
|
||||
static void doBlitOpaqueBlendLogic##ext(Args &args); \
|
||||
template<bool doscale> \
|
||||
template<bool doscale, bool rgbmod, bool alphamod> \
|
||||
static void doBlitMultiplyBlendLogic##ext(Args &args); \
|
||||
template<bool doscale> \
|
||||
template<bool doscale, bool rgbmod> \
|
||||
static void doBlitSubtractiveBlendLogic##ext(Args &args); \
|
||||
template<bool doscale> \
|
||||
template<bool doscale, bool rgbmod, bool alphamod> \
|
||||
static void doBlitAdditiveBlendLogic##ext(Args &args); \
|
||||
template<bool doscale> \
|
||||
template<bool doscale, bool rgbmod, bool alphamod> \
|
||||
static void doBlitAlphaBlendLogic##ext(Args &args); \
|
||||
static void blit##ext(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType);
|
||||
LOGIC_FUNCS_EXT(Generic)
|
||||
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
LOGIC_FUNCS_EXT(NEON)
|
||||
#endif
|
||||
LOGIC_FUNCS_EXT(Generic)
|
||||
#undef LOGIC_FUNCS_EXT
|
||||
|
||||
typedef void(*BlitFunc)(Args &, const TSpriteBlendMode &, const AlphaType &);
|
||||
static BlitFunc blitFunc;
|
||||
friend class ::BlendBlitUnfilteredTestSuite;
|
||||
|
||||
public:
|
||||
static const int SCALE_THRESHOLD = 0x100;
|
||||
static const int kBModShift = 8;
|
||||
static const int kGModShift = 16;
|
||||
static const int kRModShift = 24;
|
||||
static const int kAModShift = 0;
|
||||
|
||||
static const uint32 kBModMask = 0x0000ff00;
|
||||
static const uint32 kGModMask = 0x00ff0000;
|
||||
static const uint32 kRModMask = 0xff000000;
|
||||
static const uint32 kAModMask = 0x000000ff;
|
||||
static const uint32 kRGBModMask = (kRModMask | kGModMask | kBModMask);
|
||||
|
||||
#ifdef SCUMM_LITTLE_ENDIAN
|
||||
static const int kAIndex = 0;
|
||||
static const int kBIndex = 1;
|
||||
static const int kGIndex = 2;
|
||||
static const int kRIndex = 3;
|
||||
#else
|
||||
static const int kAIndex = 3;
|
||||
static const int kBIndex = 2;
|
||||
static const int kGIndex = 1;
|
||||
static const int kRIndex = 0;
|
||||
#endif
|
||||
|
||||
static inline int getScaleFactor(int srcSize, int dstSize) {
|
||||
return SCALE_THRESHOLD * srcSize / dstSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the pixel format all operations of TransparentSurface support.
|
||||
* Returns the pixel format all operations of BlendBlit::blit support.
|
||||
*
|
||||
* Use TS_ARGB and TS_RGB to quickly make a color in this format.
|
||||
* TS_ARGB/RGB are found in graphics/transform_struct.h
|
||||
@ -303,7 +305,6 @@ public:
|
||||
const TSpriteBlendMode blendMode,
|
||||
const AlphaType alphaType);
|
||||
|
||||
friend struct TransparentSurface;
|
||||
}; // End of class BlendBlit
|
||||
|
||||
/** @} */
|
||||
|
@ -5,7 +5,6 @@ MODULE_OBJS := \
|
||||
blit.o \
|
||||
blit-alpha.o \
|
||||
blit-scale.o \
|
||||
blit-neon.o \
|
||||
cursorman.o \
|
||||
font.o \
|
||||
fontman.o \
|
||||
|
@ -6,6 +6,7 @@
|
||||
|
||||
#include "common/fs.h"
|
||||
#include "common/stream.h"
|
||||
#include "common/system.h"
|
||||
|
||||
#include "graphics/surface.h"
|
||||
#include "graphics/managed_surface.h"
|
||||
@ -896,7 +897,103 @@ static bool areSurfacesEqual(const Graphics::Surface *a, const Graphics::Surface
|
||||
|
||||
class BlendBlitUnfilteredTestSuite : public CxxTest::TestSuite {
|
||||
public:
|
||||
void test_blend_speed() {
|
||||
#ifdef TEST_BLEND_SPEED
|
||||
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitNEON;
|
||||
#else
|
||||
Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
|
||||
#endif
|
||||
|
||||
Graphics::Surface baseSurface, destSurface;
|
||||
baseSurface.create(128, 128, OldTransparentSurface::OldTransparentSurface::getSupportedPixelFormat());
|
||||
destSurface.create(256, 256, OldTransparentSurface::OldTransparentSurface::getSupportedPixelFormat());
|
||||
for (int y = 0; y < baseSurface.h; y++) {
|
||||
for (int x = 0; x < baseSurface.w; x++) {
|
||||
int i = x / 4 + y / 4;
|
||||
baseSurface.setPixel(x, y, baseSurface.format.ARGBToColor((i & 16) * 255, (i & 1) * 255, (i & 2) * 255, (i & 4) * 255));
|
||||
}
|
||||
}
|
||||
|
||||
OldTransparentSurface::OldTransparentSurface oldSurf(baseSurface, true);
|
||||
OldTransparentSurface::OldTransparentSurface oldSurfDest(destSurface, true);
|
||||
Graphics::ManagedSurface managedSurf(&baseSurface, DisposeAfterUse::NO);
|
||||
Graphics::ManagedSurface managedSurfDest(&destSurface, DisposeAfterUse::NO);
|
||||
|
||||
int numIters = 0, numItersScaled = 0;
|
||||
double oldTime = 0.0, newTime = 0.0, genericTime = 0.0;
|
||||
double oldTimeScaled = 0.0, newTimeScaled = 0.0, genericTimeScaled = 0.0;
|
||||
const int iters = 2500;
|
||||
|
||||
for (int blendMode = Graphics::BLEND_NORMAL; blendMode < Graphics::BLEND_NORMAL + 1; blendMode++) {
|
||||
for (int alphaType = 0; alphaType <= Graphics::ALPHA_FULL; alphaType++) {
|
||||
for (int flipping = 0; flipping <= 3; flipping++) {
|
||||
oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(255, 255, 255, 255));
|
||||
managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
|
||||
oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
|
||||
uint32 oldStart = g_system->getMillis();
|
||||
for (int i = 0; i < iters; i++) {
|
||||
oldSurf.blit(oldSurfDest, 0, 0, flipping, nullptr, TS_ARGB(255, 255, 255, 255), -1, -1, (Graphics::TSpriteBlendMode)blendMode);
|
||||
}
|
||||
oldTime += g_system->getMillis() - oldStart;
|
||||
uint32 newStart = g_system->getMillis();
|
||||
for (int i = 0; i < iters; i++) {
|
||||
managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurf.w, managedSurf.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, TS_ARGB(255, 255, 255, 255), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
|
||||
}
|
||||
newTime += g_system->getMillis() - newStart;
|
||||
managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
|
||||
Graphics::BlendBlit::BlitFunc oldFunc = Graphics::BlendBlit::blitFunc;
|
||||
Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
|
||||
uint32 genericStart = g_system->getMillis();
|
||||
for (int i = 0; i < iters; i++) {
|
||||
managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurf.w, managedSurf.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, TS_ARGB(255, 255, 255, 255), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
|
||||
}
|
||||
Graphics::BlendBlit::blitFunc = oldFunc;
|
||||
genericTime += g_system->getMillis() - genericStart;
|
||||
numIters ++;
|
||||
|
||||
// scaled
|
||||
oldSurfDest.fillRect(Common::Rect(0, 0, oldSurfDest.w, oldSurfDest.h), oldSurfDest.format.ARGBToColor(255, 255, 255, 255));
|
||||
managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
|
||||
oldSurf.setAlphaMode((Graphics::AlphaType)alphaType);
|
||||
oldStart = g_system->getMillis();
|
||||
for (int i = 0; i < iters; i++) {
|
||||
oldSurf.blit(oldSurfDest, 0, 0, flipping, nullptr, TS_ARGB(255, 255, 255, 255), oldSurfDest.w, oldSurfDest.h, (Graphics::TSpriteBlendMode)blendMode);
|
||||
}
|
||||
oldTimeScaled += g_system->getMillis() - oldStart;
|
||||
newStart = g_system->getMillis();
|
||||
for (int i = 0; i < iters; i++) {
|
||||
managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, TS_ARGB(255, 255, 255, 255), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
|
||||
}
|
||||
newTimeScaled += g_system->getMillis() - newStart;
|
||||
managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(255, 255, 255, 255));
|
||||
oldFunc = Graphics::BlendBlit::blitFunc;
|
||||
Graphics::BlendBlit::blitFunc = Graphics::BlendBlit::blitGeneric;
|
||||
genericStart = g_system->getMillis();
|
||||
for (int i = 0; i < iters; i++) {
|
||||
managedSurfDest.blendBlitFrom(managedSurf, Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), Common::Rect(0, 0, managedSurf.w, managedSurf.h), flipping, TS_ARGB(255, 255, 255, 255), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
|
||||
}
|
||||
Graphics::BlendBlit::blitFunc = oldFunc;
|
||||
genericTimeScaled += g_system->getMillis() - genericStart;
|
||||
numItersScaled++;
|
||||
} // flipping
|
||||
} // alpha
|
||||
} // blend
|
||||
|
||||
debug("Old TransparentSurface::blit avg time per %d iters (in milliseconds): %f\n", iters, oldTime / numIters);
|
||||
debug("New ManagedSurface::blendBlitFrom (non SIMD) avg time per %d iters (in milliseconds): %f\n", iters, genericTime / numIters);
|
||||
debug("New ManagedSurface::blendBlitFrom avg time per %d iters (in milliseconds): %f\n", iters, newTime / numIters);
|
||||
debug("Old SCALING TransparentSurface::blit avg time per %d iters (in milliseconds): %f\n", iters, oldTimeScaled / numItersScaled);
|
||||
debug("New SCALING ManagedSurface::blendBlitFrom (non SIMD) avg time per %d iters (in milliseconds): %f\n", iters, genericTimeScaled / numItersScaled);
|
||||
debug("New SCALING ManagedSurface::blendBlitFrom avg time per %d iters (in milliseconds): %f\n", iters, newTimeScaled / numItersScaled);
|
||||
debug("Note this speed test puts the old code in the best senario against the new code.");
|
||||
|
||||
baseSurface.free();
|
||||
#endif
|
||||
}
|
||||
|
||||
void test_blend_blit_unfiltered() {
|
||||
#ifdef TEST_BLEND_SPEED
|
||||
Common::Rect dsts[] = {
|
||||
Common::Rect(4, 4, 4+16, 4+16), // Case 0 (source clipping)
|
||||
Common::Rect(24, 20, 24+16, 20+16), // Case 1 (outside of destination)
|
||||
@ -971,7 +1068,7 @@ public:
|
||||
newSurf.setAlphaMode((Graphics::AlphaType)alphaType);
|
||||
newSurf.blit(newSurfDest, dsts[rect].left, dsts[rect].top, flipping, &srcs[rect], TS_ARGB(a, r, g, b), dsts[rect].width(), dsts[rect].height(), (Graphics::TSpriteBlendMode)blendMode);
|
||||
managedSurfDest.fillRect(Common::Rect(0, 0, managedSurfDest.w, managedSurfDest.h), managedSurfDest.format.ARGBToColor(ba, br, bg, bb));
|
||||
managedSurfDest.blendBlitFrom(managedSurf, srcs[rect], dsts[rect], flipping, BLENDBLIT_ARGB(a, r, g, b), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
|
||||
managedSurfDest.blendBlitFrom(managedSurf, srcs[rect], dsts[rect], flipping, TS_ARGB(a, r, g, b), (Graphics::TSpriteBlendMode)blendMode, (Graphics::AlphaType)alphaType);
|
||||
|
||||
|
||||
|
||||
@ -1019,5 +1116,6 @@ public:
|
||||
} // blend
|
||||
|
||||
baseSurface.free();
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
@ -18,6 +18,10 @@ void BaseBackend::initBackend() {
|
||||
OSystem::initBackend();
|
||||
}
|
||||
|
||||
bool BaseBackend::hasFeature(OSystem::Feature f) {
|
||||
return false;
|
||||
}
|
||||
|
||||
void BaseBackend::fillScreen(uint32 col) {
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user