mirror of
https://github.com/libretro/scummvm.git
synced 2025-01-10 11:51:52 +00:00
AGS: Preliminary optimizations of blending funcs
Just committing my first attempts at optimizing BITMAP::draw and the blendPixel function. Here's an overview of the changes (some are temporary): - Put the loop of BITMAP::draw into its own function, drawInner. I templated it so that I could put different paths into the loop that can be optimized out at compile time if a certain blending function doesn't need them, etc. - Added Apple NEON (SIMD) intrinsics to the drawInner function. I haven't ported it to SSE yet, but there is a small library that maps NEON intrinsics to SSE ones. - Removed a few ifs from the normal x loop and put them in the y loop.
This commit is contained in:
parent
1e4a03313c
commit
aa9d13f84a
BIN
benchgfx1.bmp
BIN
benchgfx1.bmp
Binary file not shown.
Before Width: | Height: | Size: 16 KiB After Width: | Height: | Size: 16 KiB |
@ -815,6 +815,7 @@ void allegro_bitmap_test_init() {
|
||||
Bitmap *dest = BitmapHelper::CreateBitmap(100, 100, benchgfx1->GetColorDepth());
|
||||
uint64_t bench_runs[] = {1000, 10000, 100000};
|
||||
if (benchgfx1 != nullptr) {
|
||||
_G(_blender_mode) = kRgbToRgbBlender; // Using normal blender mode
|
||||
for (long unsigned int i = 0; i < sizeof(bench_runs)/sizeof(uint64_t); i++) {
|
||||
Debug::Printf(kDbgMsg_Info, "Starting Allegro Bitmap Test Bench 2 (%d bpp)", benchgfx1->GetColorDepth());
|
||||
uint32_t start = std::chrono::high_resolution_clock::now();
|
||||
|
@ -163,8 +163,13 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
|
||||
int yStart = (dstRect.top < destRect.top) ? dstRect.top - destRect.top : 0;
|
||||
|
||||
#define DRAWINNER(formattype) drawInner<formattype>(yStart, xStart, transColor, alphaMask, palette, useTint, sameFormat, src, destArea, horizFlip, vertFlip, skipTrans, srcAlpha, tintRed, tintGreen, tintBlue, dstRect, srcArea)
|
||||
if (sameFormat && format.bytesPerPixel == 4) DRAWINNER(1);
|
||||
else DRAWINNER(0);
|
||||
if (sameFormat && format.bytesPerPixel == 4 && _G(_blender_mode) == kRgbToRgbBlender) {
|
||||
if (format.bShift == 0 && format.gShift == 8 && format.rShift == 16) DRAWINNER(1);
|
||||
else DRAWINNER(0);
|
||||
}
|
||||
else {
|
||||
DRAWINNER(0);
|
||||
}
|
||||
#undef DRAWINNER
|
||||
}
|
||||
|
||||
@ -271,8 +276,8 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
|
||||
bDest = bSrc;
|
||||
} else {
|
||||
// TODO: move this to blendPixel to only do it when needed?
|
||||
format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
|
||||
blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha);
|
||||
// format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
|
||||
blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, false, destVal);
|
||||
}
|
||||
|
||||
uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
|
||||
@ -284,30 +289,37 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
|
||||
}
|
||||
}
|
||||
|
||||
void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const {
|
||||
void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const {
|
||||
switch (_G(_blender_mode)) {
|
||||
case kSourceAlphaBlender:
|
||||
if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
|
||||
blendSourceAlpha(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
|
||||
break;
|
||||
case kArgbToArgbBlender:
|
||||
if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
|
||||
blendArgbToArgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
|
||||
break;
|
||||
case kArgbToRgbBlender:
|
||||
if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
|
||||
blendArgbToRgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
|
||||
break;
|
||||
case kRgbToArgbBlender:
|
||||
if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
|
||||
blendRgbToArgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
|
||||
break;
|
||||
case kRgbToRgbBlender:
|
||||
if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
|
||||
blendRgbToRgb(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
|
||||
break;
|
||||
case kAlphaPreservedBlenderMode:
|
||||
if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
|
||||
blendPreserveAlpha(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
|
||||
break;
|
||||
case kOpaqueBlenderMode:
|
||||
blendOpaque(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
|
||||
break;
|
||||
case kAdditiveBlenderMode:
|
||||
if (!useTint && alpha != 255) format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
|
||||
blendAdditiveAlpha(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, alpha);
|
||||
break;
|
||||
case kTintBlenderMode:
|
||||
|
@ -27,6 +27,11 @@
|
||||
#include "ags/lib/allegro/color.h"
|
||||
#include "common/array.h"
|
||||
|
||||
#if defined(__aarch64__)
|
||||
// M1/M2 SIMD intrinsics
|
||||
#include "arm_neon.h"
|
||||
#endif
|
||||
|
||||
namespace AGS3 {
|
||||
|
||||
class BITMAP {
|
||||
@ -131,7 +136,7 @@ public:
|
||||
// unsigned int blender_func(unsigned long x, unsigned long y, unsigned long n)
|
||||
// when x is the sprite color, y the destination color, and n an alpha value
|
||||
|
||||
void blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const;
|
||||
void blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const;
|
||||
|
||||
|
||||
inline void rgbBlend(uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha) const {
|
||||
@ -280,12 +285,19 @@ public:
|
||||
horizFlip ? srcArea.right - 1 : srcArea.left,
|
||||
vertFlip ? srcArea.bottom - 1 - yCtr :
|
||||
srcArea.top + yCtr);
|
||||
int destX = xStart, xCtr = 0, xCtrBpp = 0, xCtrWidth = dstRect.width();
|
||||
if (xStart < 0) {
|
||||
xCtr = -xStart;
|
||||
xCtrBpp = xCtr * src.format.bytesPerPixel;
|
||||
destX = 0;
|
||||
}
|
||||
if (xStart + xCtrWidth > destArea.w) {
|
||||
xCtrWidth = destArea.w - xStart;
|
||||
}
|
||||
|
||||
if (FormatType == 0) {
|
||||
// Loop through the pixels of the row
|
||||
for (int destX = xStart, xCtr = 0, xCtrBpp = 0; xCtr < dstRect.width(); ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
|
||||
if (destX < 0 || destX >= destArea.w)
|
||||
continue;
|
||||
|
||||
for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
|
||||
const byte *srcVal = srcP + xDir * xCtrBpp;
|
||||
uint32 srcCol = getColor(srcVal, src.format.bytesPerPixel);
|
||||
|
||||
@ -315,14 +327,14 @@ public:
|
||||
gSrc = rgb.g;
|
||||
bSrc = rgb.b;
|
||||
} else {
|
||||
if (FormatType == 1) {
|
||||
aSrc = srcCol >> src.format.aShift & 0xff;
|
||||
rSrc = srcCol >> src.format.rShift & 0xff;
|
||||
gSrc = srcCol >> src.format.gShift & 0xff;
|
||||
bSrc = srcCol >> src.format.bShift & 0xff;
|
||||
} else {
|
||||
// if (FormatType == 1) {
|
||||
// aSrc = srcCol >> src.format.aShift & 0xff;
|
||||
// rSrc = srcCol >> src.format.rShift & 0xff;
|
||||
// gSrc = srcCol >> src.format.gShift & 0xff;
|
||||
// bSrc = srcCol >> src.format.bShift & 0xff;
|
||||
// } else {
|
||||
src.format.colorToARGB(srcCol, aSrc, rSrc, gSrc, bSrc);
|
||||
}
|
||||
// }
|
||||
}
|
||||
|
||||
if (srcAlpha == -1) {
|
||||
@ -343,9 +355,9 @@ public:
|
||||
aSrc = srcAlpha;
|
||||
} else {
|
||||
// TODO: move this to blendPixel to only do it when needed?
|
||||
format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
|
||||
// format.colorToARGB(getColor(destVal, format.bytesPerPixel), aDest, rDest, gDest, bDest);
|
||||
}
|
||||
blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha);
|
||||
blendPixel(aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest, srcAlpha, useTint, destVal);
|
||||
}
|
||||
|
||||
uint32 pixel = format.ARGBToColor(aDest, rDest, gDest, bDest);
|
||||
@ -353,7 +365,53 @@ public:
|
||||
*(uint32 *)destVal = pixel;
|
||||
else
|
||||
*(uint16 *)destVal = pixel;
|
||||
} // FormatType == 0
|
||||
} else { // FormatType == 1
|
||||
uint32x4_t maskedAlphas = vld1q_dup_u32(&alphaMask);
|
||||
uint32x4_t transColors = vld1q_dup_u32(&transColor);
|
||||
uint32 alpha = srcAlpha ? srcAlpha + 1 : srcAlpha;
|
||||
uint8x16_t srcCols;
|
||||
for (; xCtr + 4 < dstRect.width(); destX += 4, xCtr += 4, xCtrBpp += src.format.bytesPerPixel*4) {
|
||||
uint32 *destPtr = (uint32 *)&destP[destX * format.bytesPerPixel];
|
||||
if (srcAlpha != -1) {
|
||||
uint8x16_t srcColsRaw = vld1q_u8(srcP + xDir * xCtrBpp);
|
||||
uint8x16_t destColsRaw = vld1q_u8((uint8 *)destPtr);
|
||||
uint8x16_t diff = vqsubq_u32(srcColsRaw, destColsRaw);
|
||||
diff = vmulq_u8(diff, vmovq_n_u8(alpha));
|
||||
diff = vshrq_n_u8(diff, 8);
|
||||
diff = vaddq_u8(diff, destColsRaw);
|
||||
srcCols = vld1q_u32((const uint32 *)&diff);
|
||||
} else {
|
||||
srcCols = vld1q_u32((const uint32 *)(srcP + xDir * xCtrBpp));
|
||||
}
|
||||
uint32x4_t anded = vandq_u32(srcCols, maskedAlphas);
|
||||
uint32x4_t mask1 = skipTrans ? vceqq_u32(anded, transColors) : vmovq_n_u32(0);
|
||||
if (srcAlpha != -1) mask1 = vorrq_u32(mask1, vmovq_n_u32(0xff000000));
|
||||
uint32x4_t mask2 = vmvnq_u32(mask1);
|
||||
uint32x4_t destCols2 = vandq_u32(vld1q_u32(destPtr), mask1);
|
||||
uint32x4_t srcCols2 = vandq_u32(srcCols, mask2);
|
||||
uint32x4_t final = vorrq_u32(destCols2, srcCols2);
|
||||
vst1q_u32(destPtr, final);
|
||||
}
|
||||
// Get the last x values
|
||||
for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += src.format.bytesPerPixel) {
|
||||
const uint32 *srcCol = (const uint32 *)(srcP + xDir * xCtrBpp);
|
||||
// Check if this is a transparent color we should skip
|
||||
if (skipTrans && ((*srcCol & alphaMask) == transColor))
|
||||
continue;
|
||||
|
||||
byte *destVal = (byte *)&destP[destX * format.bytesPerPixel];
|
||||
uint32 destCol = srcAlpha == -1 ? *srcCol : *(uint32 *)destVal;
|
||||
if (srcAlpha != -1) {
|
||||
//uint8 aSrc, rSrc, gSrc, bSrc, aDest, rDest, gDest, bDest;
|
||||
format.colorToARGB(destCol, aDest, rDest, gDest, bDest);
|
||||
src.format.colorToARGB(*srcCol, aSrc, rSrc, gSrc, bSrc);
|
||||
rgbBlend(rSrc, gSrc, bSrc, rDest, gDest, bDest, srcAlpha);
|
||||
destCol = format.ARGBToColor(aDest, rDest, gDest, bDest);
|
||||
}
|
||||
*(uint32 *)destVal = destCol;
|
||||
}
|
||||
} // FormatType == 1
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include "common/scummsys.h"
|
||||
#include "common/endian.h"
|
||||
#include "common/list.h"
|
||||
#include "common/textconsole.h"
|
||||
|
||||
namespace Common {
|
||||
struct Rect;
|
||||
@ -124,7 +125,7 @@ public:
|
||||
*
|
||||
* @param newPixels The new pixel data.
|
||||
*/
|
||||
void setPixels(void *newPixels) { pixels = newPixels; }
|
||||
void setPixels(void *newPixels) {
	// The low four address bits are non-zero when the buffer does not
	// start on a 16-byte boundary; report that before storing the pointer.
	const bool lowBitsSet = ((unsigned long long)newPixels & 0xf) != 0;
	if (lowBitsSet) {
		warning("unaligned pixels!");
	}

	pixels = newPixels;
}
|
||||
|
||||
/**
|
||||
* Return a pointer to the pixel at the specified point.
|
||||
|
Loading…
Reference in New Issue
Block a user