AGS: GRAPHICS: Changed blending functions templates

This commit is contained in:
Wyatt Radkiewicz 2023-08-20 10:04:38 -06:00 committed by Eugene Sandulenko
parent 153afb1081
commit bc20c0185d
4 changed files with 118 additions and 142 deletions

View File

@ -104,9 +104,8 @@ void BITMAP::floodfill(int x, int y, int color) {
AGS3::floodfill(this, x, y, color);
}
const int SCALE_THRESHOLD = 0x100;
#define VGA_COLOR_TRANS(x) ((x) * 255 / 63)
template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
const int xDir = args.horizFlip ? -1 : 1;
byte rSrc, gSrc, bSrc, aSrc;
@ -127,9 +126,9 @@ void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
if (args.yStart < 0) { // Clip the top
yCtr = -args.yStart;
destY = 0;
if (ScaleThreshold != 0) {
if (Scale) {
scaleYCtr = yCtr * args.scaleY;
srcYCtr = scaleYCtr / ScaleThreshold;
srcYCtr = scaleYCtr / SCALE_THRESHOLD;
}
}
if (args.yStart + yCtrHeight > args.destArea.h) { // Clip the bottom
@ -142,8 +141,8 @@ void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
args.vertFlip ? args.srcArea.bottom - 1 - yCtr :
args.srcArea.top + yCtr);
for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
if (ScaleThreshold != 0) {
int newSrcYCtr = scaleYCtr / ScaleThreshold;
if (Scale) {
int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
if (srcYCtr != newSrcYCtr) {
int diffSrcYCtr = newSrcYCtr - srcYCtr;
srcP += args.src.pitch * diffSrcYCtr;
@ -153,8 +152,8 @@ void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
// Loop through the pixels of the row
for (int destX = args.xStart, xCtr = xCtrStart, xCtrBpp = xCtrBppStart, scaleXCtr = xCtr * args.scaleX; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel, scaleXCtr += args.scaleX) {
const byte *srcVal = srcP + xDir * xCtrBpp;
if (ScaleThreshold != 0) {
srcVal = srcP + (scaleXCtr / ScaleThreshold) * SrcBytesPerPixel;
if (Scale) {
srcVal = srcP + (scaleXCtr / SCALE_THRESHOLD) * SrcBytesPerPixel;
}
uint32 srcCol = getColor(srcVal, SrcBytesPerPixel);
@ -233,7 +232,7 @@ void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
}
destP += args.destArea.pitch;
if (ScaleThreshold == 0) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
}
}
@ -312,34 +311,34 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
if (_G(simd_flags) == AGS3::Globals::SIMD_NONE) {
if (sameFormat) {
switch (format.bytesPerPixel) {
case 1: DRAWINNER((drawInnerGeneric<1, 1, 0>)); return;
case 2: DRAWINNER((drawInnerGeneric<2, 2, 0>)); return;
case 4: DRAWINNER((drawInnerGeneric<4, 4, 0>)); return;
case 1: DRAWINNER((drawInnerGeneric<1, 1, false>)); return;
case 2: DRAWINNER((drawInnerGeneric<2, 2, false>)); return;
case 4: DRAWINNER((drawInnerGeneric<4, 4, false>)); return;
}
} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) {
DRAWINNER((drawInnerGeneric<4, 2, 0>));
DRAWINNER((drawInnerGeneric<4, 2, false>));
} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
DRAWINNER((drawInnerGeneric<2, 4, 0>));
DRAWINNER((drawInnerGeneric<2, 4, false>));
}
} else {
if (sameFormat) {
switch (format.bytesPerPixel) {
case 1: DRAWINNER(drawInner1Bpp<0>); return;
case 2: DRAWINNER(drawInner2Bpp<0>); return;
case 4: DRAWINNER((drawInner4BppWithConv<4, 4, 0>)); return;
case 1: DRAWINNER(drawInner1Bpp<false>); return;
case 2: DRAWINNER(drawInner2Bpp<false>); return;
case 4: DRAWINNER((drawInner4BppWithConv<4, 4, false>)); return;
}
} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) {
DRAWINNER((drawInner4BppWithConv<4, 2, 0>));
DRAWINNER((drawInner4BppWithConv<4, 2, false>));
return;
} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
DRAWINNER((drawInner4BppWithConv<2, 4, 0>));
DRAWINNER((drawInner4BppWithConv<2, 4, false>));
return;
}
}
if (format.bytesPerPixel == 4) // src.bytesPerPixel must be 1 here
DRAWINNER((drawInnerGeneric<4, 1, 0>));
DRAWINNER((drawInnerGeneric<4, 1, false>));
else
DRAWINNER((drawInnerGeneric<2, 1, 0>));
DRAWINNER((drawInnerGeneric<2, 1, false>));
#undef DRAWINNER
}
@ -395,34 +394,34 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
if (_G(simd_flags) == AGS3::Globals::SIMD_NONE) {
if (sameFormat) {
switch (format.bytesPerPixel) {
case 1: DRAWINNER((drawInnerGeneric<1, 1, SCALE_THRESHOLD>)); return;
case 2: DRAWINNER((drawInnerGeneric<2, 2, SCALE_THRESHOLD>)); return;
case 4: DRAWINNER((drawInnerGeneric<4, 4, SCALE_THRESHOLD>)); return;
case 1: DRAWINNER((drawInnerGeneric<1, 1, true>)); return;
case 2: DRAWINNER((drawInnerGeneric<2, 2, true>)); return;
case 4: DRAWINNER((drawInnerGeneric<4, 4, true>)); return;
}
} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) {
DRAWINNER((drawInnerGeneric<4, 2, SCALE_THRESHOLD>));
DRAWINNER((drawInnerGeneric<4, 2, true>));
} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
DRAWINNER((drawInnerGeneric<2, 4, SCALE_THRESHOLD>));
DRAWINNER((drawInnerGeneric<2, 4, true>));
}
} else {
if (sameFormat) {
switch (format.bytesPerPixel) {
case 1: DRAWINNER(drawInner1Bpp<SCALE_THRESHOLD>); return;
case 2: DRAWINNER(drawInner2Bpp<SCALE_THRESHOLD>); return;
case 4: DRAWINNER((drawInner4BppWithConv<4, 4, SCALE_THRESHOLD>)); return;
case 1: DRAWINNER(drawInner1Bpp<true>); return;
case 2: DRAWINNER(drawInner2Bpp<true>); return;
case 4: DRAWINNER((drawInner4BppWithConv<4, 4, true>)); return;
}
} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) {
DRAWINNER((drawInner4BppWithConv<4, 2, SCALE_THRESHOLD>));
DRAWINNER((drawInner4BppWithConv<4, 2, true>));
return;
} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
DRAWINNER((drawInner4BppWithConv<2, 4, SCALE_THRESHOLD>));
DRAWINNER((drawInner4BppWithConv<2, 4, true>));
return;
}
}
if (format.bytesPerPixel == 4) // src.bytesPerPixel must be 1 here
DRAWINNER((drawInnerGeneric<4, 1, SCALE_THRESHOLD>));
DRAWINNER((drawInnerGeneric<4, 1, true>));
else
DRAWINNER((drawInnerGeneric<2, 1, SCALE_THRESHOLD>));
DRAWINNER((drawInnerGeneric<2, 1, true>));
#undef DRAWINNER
}
void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const {

View File

@ -265,6 +265,8 @@ public:
// kTintBlenderMode and kTintLightBlenderMode
void blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool light) const;
// Number of low-order (fractional) bits in the scaled-drawing step counters:
// the SIMD paths shift the accumulated X counter right by this amount to
// obtain a source pixel index.
constexpr static int SCALE_THRESHOLD_BITS = 8;
// Divisor form of the same quantity (1 << 8 == 0x100): the scalar paths
// divide scaleXCtr / scaleYCtr by it to get the source column / row.
constexpr static int SCALE_THRESHOLD = 1 << SCALE_THRESHOLD_BITS;
struct DrawInnerArgs {
bool useTint, sameFormat, horizFlip, vertFlip, skipTrans, doScale;
int xStart, yStart, srcAlpha, tintRed, tintGreen, tintBlue, scaleX, scaleY;
@ -280,13 +282,13 @@ public:
DrawInnerArgs(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
};
template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
void drawInner4BppWithConv(DrawInnerArgs &args);
template<int ScaleThreshold>
template<bool Scale>
void drawInner2Bpp(DrawInnerArgs &args);
template<int ScaleThreshold>
template<bool Scale>
void drawInner1Bpp(DrawInnerArgs &args);
template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
void drawInnerGeneric(DrawInnerArgs &args);
inline uint32 getColor(const byte *data, byte bpp) const {

View File

@ -13,7 +13,7 @@
namespace AGS3 {
// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
const int xDir = args.horizFlip ? -1 : 1;
byte rSrc, gSrc, bSrc, aSrc;
@ -45,13 +45,13 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
args.xStart = 0;
}
int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
if (Scale) yCtrHeight = args.dstRect.height();
if (args.yStart < 0) {
yCtr = -args.yStart;
destY = 0;
if (ScaleThreshold != 0) {
if (Scale) {
scaleYCtr = yCtr * args.scaleY;
srcYCtr = scaleYCtr / ScaleThreshold;
srcYCtr = scaleYCtr / SCALE_THRESHOLD;
}
}
if (args.yStart + yCtrHeight > args.destArea.h) {
@ -65,8 +65,7 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
uint32x4_t xCtrWidthSIMD = vdupq_n_u32(xCtrWidth); // This is the width of the row
if (ScaleThreshold == 0) {
// If we are not scaling the image
if (!Scale) {
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
byte *destPtr = &destP[destX * DestBytesPerPixel];
// Skip pixels that are beyond the row
@ -78,7 +77,7 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
} else {
// Here we are scaling the image
int newSrcYCtr = scaleYCtr / ScaleThreshold;
int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
// Since the source yctr might not update every row of the destination, we have
// to see if we are on a new row...
if (srcYCtr != newSrcYCtr) {
@ -94,12 +93,8 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
uint32x4_t indexes = vdupq_n_u32(scaleXCtr);
#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
// Calculate in parallel the indexes of the pixels
indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), 8), SrcBytesPerPixel);
#else
#error Change code to allow different scale threshold!
#endif
indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), SrcBytesPerPixel);
// Simply memcpy them in. memcpy has no real performance overhead here
memcpy(&srcBuffer[0*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 0), SrcBytesPerPixel);
memcpy(&srcBuffer[1*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 1), SrcBytesPerPixel);
@ -132,7 +127,7 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
// Drawing the last few not scaled pixels here.
// Same as the loop above but now we check if we are going to overflow,
// and thus we don't need to mask out pixels that go over the row.
if (ScaleThreshold == 0) {
if (!Scale) {
for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u32(0));
@ -150,8 +145,8 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
// For the last 4 pixels, we just do them in serial, nothing special
for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
if (ScaleThreshold != 0) {
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * SrcBytesPerPixel);
if (Scale) {
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * SrcBytesPerPixel);
}
byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
@ -184,7 +179,7 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
}
}
template<int ScaleThreshold>
template<bool Scale>
void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
const int xDir = args.horizFlip ? -1 : 1;
byte rSrc, gSrc, bSrc, aSrc;
@ -213,13 +208,13 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
args.xStart = 0;
}
int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
if (Scale) yCtrHeight = args.dstRect.height();
if (args.yStart < 0) {
yCtr = -args.yStart;
destY = 0;
if (ScaleThreshold != 0) {
if (Scale) {
scaleYCtr = yCtr * args.scaleY;
srcYCtr = scaleYCtr / ScaleThreshold;
srcYCtr = scaleYCtr / SCALE_THRESHOLD;
}
}
if (args.yStart + yCtrHeight > args.destArea.h) {
@ -232,7 +227,7 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
uint16x8_t xCtrWidthSIMD = vmovq_n_u16(xCtrWidth); // This is the width of the row
if (ScaleThreshold == 0) {
if (!Scale) {
// If we are not scaling the image
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
byte *destPtr = &destP[destX * 2];
@ -245,7 +240,7 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
} else {
// Here we are scaling the image
int newSrcYCtr = scaleYCtr / ScaleThreshold;
int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
// Since the source yctr might not update every row of the destination, we have
// to see if we are on a new row...
if (srcYCtr != newSrcYCtr) {
@ -261,13 +256,9 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
uint32x4_t indexes = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
// Calculate in parallel the indexes of the pixels
indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), 8), 2);
indexes2 = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), 8), 2);
#else
#error Change code to allow different scale threshold!
#endif
indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 2);
indexes2 = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS), 2);
// Simply memcpy them in. memcpy has no real performance overhead here
srcBuffer[0] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 0));
srcBuffer[1] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 1));
@ -304,7 +295,7 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
// Drawing the last few not scaled pixels here.
// Same as the loop above but now we check if we are going to overflow,
// and thus we don't need to mask out pixels that go over the row.
if (ScaleThreshold == 0) {
if (!Scale) {
for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
byte *destPtr = &destP[destX * 2];
drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u16(0));
@ -322,8 +313,8 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
// For the last 4 pixels, we just do them in serial, nothing special
for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
if (ScaleThreshold != 0) {
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * 2);
if (Scale) {
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * 2);
}
byte *destVal = (byte *)&destP[destX * 2];
uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
@ -355,7 +346,7 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
}
}
template<int ScaleThreshold>
template<bool Scale>
void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
const int xDir = args.horizFlip ? -1 : 1;
uint8x16_t transColors = vld1q_dup_u8(&args.transColor);
@ -377,13 +368,13 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
args.xStart = 0;
}
int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
if (Scale) yCtrHeight = args.dstRect.height();
if (args.yStart < 0) {
yCtr = -args.yStart;
destY = 0;
if (ScaleThreshold != 0) {
if (Scale) {
scaleYCtr = yCtr * args.scaleY;
srcYCtr = scaleYCtr / ScaleThreshold;
srcYCtr = scaleYCtr / SCALE_THRESHOLD;
}
}
if (args.yStart + yCtrHeight > args.destArea.h) {
@ -395,10 +386,10 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
args.horizFlip ? args.srcArea.right - 16 : args.srcArea.left,
args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
if (ScaleThreshold != 0) {
if (Scale) {
// So here we update the srcYCtr differently due to this being for
// scaling
int newSrcYCtr = scaleYCtr / ScaleThreshold;
int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
if (srcYCtr != newSrcYCtr) {
// Since the source yctr might not update every row of the destination, we have
// to see if we are on a new row...
@ -415,18 +406,14 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
// can't have any blending applied to them
uint8x16_t destCols = vld1q_u8(destPtr);
uint8x16_t srcCols = vld1q_u8(srcP + xDir * xCtr);
if (ScaleThreshold != 0) {
if (Scale) {
// If we are scaling, we have to set each pixel individually
uint32x4_t indexes1 = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
uint32x4_t indexes3 = vdupq_n_u32(scaleXCtr), indexes4 = vdupq_n_u32(scaleXCtr);
#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
indexes1 = vshrq_n_u32(vaddq_u32(indexes1, scaleAdds1), 8);
indexes2 = vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), 8);
indexes3 = vshrq_n_u32(vaddq_u32(indexes3, scaleAdds3), 8);
indexes4 = vshrq_n_u32(vaddq_u32(indexes4, scaleAdds4), 8);
#else
#error Change code to allow different scale threshold!
#endif
indexes1 = vshrq_n_u32(vaddq_u32(indexes1, scaleAdds1), SCALE_THRESHOLD_BITS);
indexes2 = vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS);
indexes3 = vshrq_n_u32(vaddq_u32(indexes3, scaleAdds3), SCALE_THRESHOLD_BITS);
indexes4 = vshrq_n_u32(vaddq_u32(indexes4, scaleAdds4), SCALE_THRESHOLD_BITS);
srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 0)], srcCols, 0);
srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 1)], srcCols, 1);
srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 2)], srcCols, 2);
@ -462,8 +449,8 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
if (args.horizFlip) srcP += 15;
for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += args.scaleX) {
const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
if (ScaleThreshold != 0) {
srcCol = (const byte *)(srcP + scaleXCtr / ScaleThreshold);
if (Scale) {
srcCol = (const byte *)(srcP + scaleXCtr / SCALE_THRESHOLD);
}
// Check if this is a transparent color we should skip
if (args.skipTrans && *srcCol == args.transColor)
@ -475,21 +462,21 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
if (args.horizFlip) srcP -= 15; // Undo what we did up there
destP += args.destArea.pitch; // Goto next row
// Only advance the src row by 1 every time like this if we don't scale
if (ScaleThreshold == 0) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
}
}
template void BITMAP::drawInner4BppWithConv<4, 4, 0>(DrawInnerArgs &args);
template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(DrawInnerArgs &args);
template void BITMAP::drawInner4BppWithConv<4, 2, 0>(DrawInnerArgs &args);
template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(DrawInnerArgs &args);
template void BITMAP::drawInner4BppWithConv<2, 4, 0>(DrawInnerArgs &args);
template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(DrawInnerArgs &args);
template void BITMAP::drawInner2Bpp<0>(DrawInnerArgs &args);
template void BITMAP::drawInner2Bpp<0x100>(DrawInnerArgs &args);
template void BITMAP::drawInner1Bpp<0>(DrawInnerArgs &args);
template void BITMAP::drawInner1Bpp<0x100>(DrawInnerArgs &args);
template void BITMAP::drawInner4BppWithConv<4, 4, false>(DrawInnerArgs &args);
template void BITMAP::drawInner4BppWithConv<4, 4, true>(DrawInnerArgs &args);
template void BITMAP::drawInner4BppWithConv<4, 2, false>(DrawInnerArgs &args);
template void BITMAP::drawInner4BppWithConv<4, 2, true>(DrawInnerArgs &args);
template void BITMAP::drawInner4BppWithConv<2, 4, false>(DrawInnerArgs &args);
template void BITMAP::drawInner4BppWithConv<2, 4, true>(DrawInnerArgs &args);
template void BITMAP::drawInner2Bpp<false>(DrawInnerArgs &args);
template void BITMAP::drawInner2Bpp<true>(DrawInnerArgs &args);
template void BITMAP::drawInner1Bpp<false>(DrawInnerArgs &args);
template void BITMAP::drawInner1Bpp<true>(DrawInnerArgs &args);
} // namespace AGS3

View File

@ -25,7 +25,7 @@ inline uint32 extract32_idx3(__m128i x) {
}
// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
const int xDir = args.horizFlip ? -1 : 1;
byte rSrc, gSrc, bSrc, aSrc;
@ -57,13 +57,13 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
args.xStart = 0;
}
int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
if (Scale) yCtrHeight = args.dstRect.height();
if (args.yStart < 0) {
yCtr = -args.yStart;
destY = 0;
if (ScaleThreshold != 0) {
if (Scale) {
scaleYCtr = yCtr * args.scaleY;
srcYCtr = scaleYCtr / ScaleThreshold;
srcYCtr = scaleYCtr / SCALE_THRESHOLD;
}
}
if (args.yStart + yCtrHeight > args.destArea.h) {
@ -77,7 +77,7 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
__m128i xCtrWidthSIMD = _mm_set1_epi32(xCtrWidth); // This is the width of the row
if (ScaleThreshold == 0) {
if (!Scale) {
// If we are not scaling the image
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
byte *destPtr = &destP[destX * DestBytesPerPixel];
@ -90,7 +90,7 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
} else {
// Here we are scaling the image
int newSrcYCtr = scaleYCtr / ScaleThreshold;
int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
// Since the source yctr might not update every row of the destination, we have
// to see if we are on a new row...
if (srcYCtr != newSrcYCtr) {
@ -106,15 +106,11 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
__m128i indexes = _mm_set1_epi32(scaleXCtr);
#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
// Calculate in parallel the indexes of the pixels
if (SrcBytesPerPixel == 4)
indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), 8), 2);
indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 2);
else
indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), 8), 1);
#else
#error Change code to allow different scale threshold!
#endif
indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 1);
// Simply memcpy them in. memcpy has no real performance overhead here
memcpy(&srcBuffer[0*(size_t)SrcBytesPerPixel], srcP + extract32_idx0(indexes), SrcBytesPerPixel);
memcpy(&srcBuffer[1*(size_t)SrcBytesPerPixel], srcP + extract32_idx1(indexes), SrcBytesPerPixel);
@ -147,7 +143,7 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
// Drawing the last few not scaled pixels here.
// Same as the loop above but now we check if we are going to overflow,
// and thus we don't need to mask out pixels that go over the row.
if (ScaleThreshold == 0) {
if (!Scale) {
for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_setzero_si128());
@ -165,8 +161,8 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
// For the last 4 pixels, we just do them in serial, nothing special
for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
if (ScaleThreshold != 0) {
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * SrcBytesPerPixel);
if (Scale) {
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * SrcBytesPerPixel);
}
byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
@ -228,13 +224,13 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
args.xStart = 0;
}
int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
if (Scale) yCtrHeight = args.dstRect.height();
if (args.yStart < 0) {
yCtr = -args.yStart;
destY = 0;
if (ScaleThreshold != 0) {
if (Scale) {
scaleYCtr = yCtr * args.scaleY;
srcYCtr = scaleYCtr / ScaleThreshold;
srcYCtr = scaleYCtr / SCALE_THRESHOLD;
}
}
if (args.yStart + yCtrHeight > args.destArea.h) {
@ -247,7 +243,7 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
__m128i xCtrWidthSIMD = _mm_set1_epi16(xCtrWidth); // This is the width of the row
if (ScaleThreshold == 0) {
if (!Scale) {
// If we are not scaling the image
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
byte *destPtr = &destP[destX * 2];
@ -260,7 +256,7 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
} else {
// Here we are scaling the image
int newSrcYCtr = scaleYCtr / ScaleThreshold;
int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
// Since the source yctr might not update every row of the destination, we have
// to see if we are on a new row...
if (srcYCtr != newSrcYCtr) {
@ -276,13 +272,9 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
__m128i indexes = _mm_set1_epi32(scaleXCtr), indexes2 = _mm_set1_epi32(scaleXCtr);
#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
// Calculate in parallel the indexes of the pixels
indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), 8), 1);
indexes2 = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), 8), 1);
#else
#error Change code to allow different scale threshold!
#endif
indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 1);
indexes2 = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS), 1);
// Simply memcpy them in. memcpy has no real performance overhead here
srcBuffer[0] = *(const uint16 *)(srcP + extract32_idx0(indexes));
srcBuffer[1] = *(const uint16 *)(srcP + extract32_idx1(indexes));
@ -319,7 +311,7 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
// Drawing the last few not scaled pixels here.
// Same as the loop above but now we check if we are going to overflow,
// and thus we don't need to mask out pixels that go over the row.
if (ScaleThreshold == 0) {
if (!Scale) {
for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
byte *destPtr = &destP[destX * 2];
drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_setzero_si128());
@ -337,8 +329,8 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
// For the last 4 pixels, we just do them in serial, nothing special
for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
if (ScaleThreshold != 0) {
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * 2);
if (Scale) {
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * 2);
}
byte *destVal = (byte *)&destP[destX * 2];
uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
@ -370,7 +362,7 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
}
}
template<int ScaleThreshold>
template<bool Scale>
void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
const int xDir = args.horizFlip ? -1 : 1;
__m128i transColors = _mm_set1_epi16(args.transColor | (args.transColor << 8));
@ -392,13 +384,13 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
args.xStart = 0;
}
int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
if (Scale) yCtrHeight = args.dstRect.height();
if (args.yStart < 0) {
yCtr = -args.yStart;
destY = 0;
if (ScaleThreshold != 0) {
if (Scale) {
scaleYCtr = yCtr * args.scaleY;
srcYCtr = scaleYCtr / ScaleThreshold;
srcYCtr = scaleYCtr / SCALE_THRESHOLD;
}
}
if (args.yStart + yCtrHeight > args.destArea.h) {
@ -410,10 +402,10 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
args.horizFlip ? args.srcArea.right - 16 : args.srcArea.left,
args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
if (ScaleThreshold != 0) {
if (Scale) {
// So here we update the srcYCtr differently due to this being for
// scaling
int newSrcYCtr = scaleYCtr / ScaleThreshold;
int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
if (srcYCtr != newSrcYCtr) {
// Since the source yctr might not update every row of the destination, we have
// to see if we are on a new row...
@ -430,19 +422,15 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
// can't have any blending applied to them
__m128i destCols = _mm_loadu_si128((const __m128i *)destPtr);
__m128i srcCols = _mm_loadu_si128((const __m128i *)(srcP + xDir * xCtr));
if (ScaleThreshold != 0) {
if (Scale) {
// If we are scaling, we have to set each pixel individually
__m128i indexes1 = _mm_set1_epi32(scaleXCtr), indexes2 = _mm_set1_epi32(scaleXCtr);
__m128i indexes3 = _mm_set1_epi32(scaleXCtr), indexes4 = _mm_set1_epi32(scaleXCtr);
#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
// Calculate in parallel the indexes of the pixels
indexes1 = _mm_srli_epi32(_mm_add_epi32(indexes1, scaleAdds1), 8);
indexes2 = _mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), 8);
indexes3 = _mm_srli_epi32(_mm_add_epi32(indexes3, scaleAdds3), 8);
indexes4 = _mm_srli_epi32(_mm_add_epi32(indexes4, scaleAdds4), 8);
#else
#error Change code to allow different scale threshold!
#endif
indexes1 = _mm_srli_epi32(_mm_add_epi32(indexes1, scaleAdds1), SCALE_THRESHOLD_BITS);
indexes2 = _mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS);
indexes3 = _mm_srli_epi32(_mm_add_epi32(indexes3, scaleAdds3), SCALE_THRESHOLD_BITS);
indexes4 = _mm_srli_epi32(_mm_add_epi32(indexes4, scaleAdds4), SCALE_THRESHOLD_BITS);
srcCols = _mm_set_epi8(
srcP[extract32_idx3(indexes4)],
srcP[extract32_idx2(indexes4)],
@ -482,8 +470,8 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
if (args.horizFlip) srcP += 15;
for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += args.scaleX) {
const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
if (ScaleThreshold != 0) {
srcCol = (const byte *)(srcP + scaleXCtr / ScaleThreshold);
if (Scale) {
srcCol = (const byte *)(srcP + scaleXCtr / SCALE_THRESHOLD);
}
// Check if this is a transparent color we should skip
if (args.skipTrans && *srcCol == args.transColor)
@ -495,7 +483,7 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
if (args.horizFlip) srcP -= 15; // Undo what we did up there
destP += args.destArea.pitch; // Goto next row
// Only advance the src row by 1 every time like this if we don't scale
if (ScaleThreshold == 0) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
}
}