mirror of
https://github.com/libretro/scummvm.git
synced 2025-01-10 11:51:52 +00:00
AGS: GRAPHICS: Changed blending function templates
This commit is contained in:
parent
153afb1081
commit
bc20c0185d
@ -104,9 +104,8 @@ void BITMAP::floodfill(int x, int y, int color) {
|
||||
AGS3::floodfill(this, x, y, color);
|
||||
}
|
||||
|
||||
const int SCALE_THRESHOLD = 0x100;
|
||||
#define VGA_COLOR_TRANS(x) ((x) * 255 / 63)
|
||||
template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
|
||||
template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
|
||||
void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
|
||||
const int xDir = args.horizFlip ? -1 : 1;
|
||||
byte rSrc, gSrc, bSrc, aSrc;
|
||||
@ -127,9 +126,9 @@ void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
|
||||
if (args.yStart < 0) { // Clip the top
|
||||
yCtr = -args.yStart;
|
||||
destY = 0;
|
||||
if (ScaleThreshold != 0) {
|
||||
if (Scale) {
|
||||
scaleYCtr = yCtr * args.scaleY;
|
||||
srcYCtr = scaleYCtr / ScaleThreshold;
|
||||
srcYCtr = scaleYCtr / SCALE_THRESHOLD;
|
||||
}
|
||||
}
|
||||
if (args.yStart + yCtrHeight > args.destArea.h) { // Clip the bottom
|
||||
@ -142,8 +141,8 @@ void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
|
||||
args.vertFlip ? args.srcArea.bottom - 1 - yCtr :
|
||||
args.srcArea.top + yCtr);
|
||||
for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
|
||||
if (ScaleThreshold != 0) {
|
||||
int newSrcYCtr = scaleYCtr / ScaleThreshold;
|
||||
if (Scale) {
|
||||
int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
|
||||
if (srcYCtr != newSrcYCtr) {
|
||||
int diffSrcYCtr = newSrcYCtr - srcYCtr;
|
||||
srcP += args.src.pitch * diffSrcYCtr;
|
||||
@ -153,8 +152,8 @@ void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
|
||||
// Loop through the pixels of the row
|
||||
for (int destX = args.xStart, xCtr = xCtrStart, xCtrBpp = xCtrBppStart, scaleXCtr = xCtr * args.scaleX; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel, scaleXCtr += args.scaleX) {
|
||||
const byte *srcVal = srcP + xDir * xCtrBpp;
|
||||
if (ScaleThreshold != 0) {
|
||||
srcVal = srcP + (scaleXCtr / ScaleThreshold) * SrcBytesPerPixel;
|
||||
if (Scale) {
|
||||
srcVal = srcP + (scaleXCtr / SCALE_THRESHOLD) * SrcBytesPerPixel;
|
||||
}
|
||||
uint32 srcCol = getColor(srcVal, SrcBytesPerPixel);
|
||||
|
||||
@ -233,7 +232,7 @@ void BITMAP::drawInnerGeneric(DrawInnerArgs &args) {
|
||||
}
|
||||
|
||||
destP += args.destArea.pitch;
|
||||
if (ScaleThreshold == 0) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
|
||||
if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
|
||||
}
|
||||
}
|
||||
|
||||
@ -312,34 +311,34 @@ void BITMAP::draw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
|
||||
if (_G(simd_flags) == AGS3::Globals::SIMD_NONE) {
|
||||
if (sameFormat) {
|
||||
switch (format.bytesPerPixel) {
|
||||
case 1: DRAWINNER((drawInnerGeneric<1, 1, 0>)); return;
|
||||
case 2: DRAWINNER((drawInnerGeneric<2, 2, 0>)); return;
|
||||
case 4: DRAWINNER((drawInnerGeneric<4, 4, 0>)); return;
|
||||
case 1: DRAWINNER((drawInnerGeneric<1, 1, false>)); return;
|
||||
case 2: DRAWINNER((drawInnerGeneric<2, 2, false>)); return;
|
||||
case 4: DRAWINNER((drawInnerGeneric<4, 4, false>)); return;
|
||||
}
|
||||
} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) {
|
||||
DRAWINNER((drawInnerGeneric<4, 2, 0>));
|
||||
DRAWINNER((drawInnerGeneric<4, 2, false>));
|
||||
} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
|
||||
DRAWINNER((drawInnerGeneric<2, 4, 0>));
|
||||
DRAWINNER((drawInnerGeneric<2, 4, false>));
|
||||
}
|
||||
} else {
|
||||
if (sameFormat) {
|
||||
switch (format.bytesPerPixel) {
|
||||
case 1: DRAWINNER(drawInner1Bpp<0>); return;
|
||||
case 2: DRAWINNER(drawInner2Bpp<0>); return;
|
||||
case 4: DRAWINNER((drawInner4BppWithConv<4, 4, 0>)); return;
|
||||
case 1: DRAWINNER(drawInner1Bpp<false>); return;
|
||||
case 2: DRAWINNER(drawInner2Bpp<false>); return;
|
||||
case 4: DRAWINNER((drawInner4BppWithConv<4, 4, false>)); return;
|
||||
}
|
||||
} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) {
|
||||
DRAWINNER((drawInner4BppWithConv<4, 2, 0>));
|
||||
DRAWINNER((drawInner4BppWithConv<4, 2, false>));
|
||||
return;
|
||||
} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
|
||||
DRAWINNER((drawInner4BppWithConv<2, 4, 0>));
|
||||
DRAWINNER((drawInner4BppWithConv<2, 4, false>));
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (format.bytesPerPixel == 4) // src.bytesPerPixel must be 1 here
|
||||
DRAWINNER((drawInnerGeneric<4, 1, 0>));
|
||||
DRAWINNER((drawInnerGeneric<4, 1, false>));
|
||||
else
|
||||
DRAWINNER((drawInnerGeneric<2, 1, 0>));
|
||||
DRAWINNER((drawInnerGeneric<2, 1, false>));
|
||||
#undef DRAWINNER
|
||||
}
|
||||
|
||||
@ -395,34 +394,34 @@ void BITMAP::stretchDraw(const BITMAP *srcBitmap, const Common::Rect &srcRect,
|
||||
if (_G(simd_flags) == AGS3::Globals::SIMD_NONE) {
|
||||
if (sameFormat) {
|
||||
switch (format.bytesPerPixel) {
|
||||
case 1: DRAWINNER((drawInnerGeneric<1, 1, SCALE_THRESHOLD>)); return;
|
||||
case 2: DRAWINNER((drawInnerGeneric<2, 2, SCALE_THRESHOLD>)); return;
|
||||
case 4: DRAWINNER((drawInnerGeneric<4, 4, SCALE_THRESHOLD>)); return;
|
||||
case 1: DRAWINNER((drawInnerGeneric<1, 1, true>)); return;
|
||||
case 2: DRAWINNER((drawInnerGeneric<2, 2, true>)); return;
|
||||
case 4: DRAWINNER((drawInnerGeneric<4, 4, true>)); return;
|
||||
}
|
||||
} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) {
|
||||
DRAWINNER((drawInnerGeneric<4, 2, SCALE_THRESHOLD>));
|
||||
DRAWINNER((drawInnerGeneric<4, 2, true>));
|
||||
} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
|
||||
DRAWINNER((drawInnerGeneric<2, 4, SCALE_THRESHOLD>));
|
||||
DRAWINNER((drawInnerGeneric<2, 4, true>));
|
||||
}
|
||||
} else {
|
||||
if (sameFormat) {
|
||||
switch (format.bytesPerPixel) {
|
||||
case 1: DRAWINNER(drawInner1Bpp<SCALE_THRESHOLD>); return;
|
||||
case 2: DRAWINNER(drawInner2Bpp<SCALE_THRESHOLD>); return;
|
||||
case 4: DRAWINNER((drawInner4BppWithConv<4, 4, SCALE_THRESHOLD>)); return;
|
||||
case 1: DRAWINNER(drawInner1Bpp<true>); return;
|
||||
case 2: DRAWINNER(drawInner2Bpp<true>); return;
|
||||
case 4: DRAWINNER((drawInner4BppWithConv<4, 4, true>)); return;
|
||||
}
|
||||
} else if (format.bytesPerPixel == 4 && src.format.bytesPerPixel == 2) {
|
||||
DRAWINNER((drawInner4BppWithConv<4, 2, SCALE_THRESHOLD>));
|
||||
DRAWINNER((drawInner4BppWithConv<4, 2, true>));
|
||||
return;
|
||||
} else if (format.bytesPerPixel == 2 && src.format.bytesPerPixel == 4) {
|
||||
DRAWINNER((drawInner4BppWithConv<2, 4, SCALE_THRESHOLD>));
|
||||
DRAWINNER((drawInner4BppWithConv<2, 4, true>));
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (format.bytesPerPixel == 4) // src.bytesPerPixel must be 1 here
|
||||
DRAWINNER((drawInnerGeneric<4, 1, SCALE_THRESHOLD>));
|
||||
DRAWINNER((drawInnerGeneric<4, 1, true>));
|
||||
else
|
||||
DRAWINNER((drawInnerGeneric<2, 1, SCALE_THRESHOLD>));
|
||||
DRAWINNER((drawInnerGeneric<2, 1, true>));
|
||||
#undef DRAWINNER
|
||||
}
|
||||
void BITMAP::blendPixel(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool useTint, byte *destVal) const {
|
||||
|
@ -265,6 +265,8 @@ public:
|
||||
// kTintBlenderMode and kTintLightBlenderMode
|
||||
void blendTintSprite(uint8 aSrc, uint8 rSrc, uint8 gSrc, uint8 bSrc, uint8 &aDest, uint8 &rDest, uint8 &gDest, uint8 &bDest, uint32 alpha, bool light) const;
|
||||
|
||||
constexpr static int SCALE_THRESHOLD_BITS = 8;
|
||||
constexpr static int SCALE_THRESHOLD = 1 << SCALE_THRESHOLD_BITS;
|
||||
struct DrawInnerArgs {
|
||||
bool useTint, sameFormat, horizFlip, vertFlip, skipTrans, doScale;
|
||||
int xStart, yStart, srcAlpha, tintRed, tintGreen, tintBlue, scaleX, scaleY;
|
||||
@ -280,13 +282,13 @@ public:
|
||||
DrawInnerArgs(int yStart, int xStart, uint32 transColor, uint32 alphaMask, PALETTE palette, int useTint, int sameFormat, const ::Graphics::ManagedSurface &src, ::Graphics::Surface &destArea, int horizFlip, int vertFlip, int skipTrans, int srcAlpha, int tintRed, int tintGreen, int tintBlue, const Common::Rect &dstRect, const Common::Rect &srcArea, const BlenderMode blenderMode, int scaleX, int scaleY);
|
||||
};
|
||||
|
||||
template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
|
||||
template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
|
||||
void drawInner4BppWithConv(DrawInnerArgs &args);
|
||||
template<int ScaleThreshold>
|
||||
template<bool Scale>
|
||||
void drawInner2Bpp(DrawInnerArgs &args);
|
||||
template<int ScaleThreshold>
|
||||
template<bool Scale>
|
||||
void drawInner1Bpp(DrawInnerArgs &args);
|
||||
template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
|
||||
template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
|
||||
void drawInnerGeneric(DrawInnerArgs &args);
|
||||
|
||||
inline uint32 getColor(const byte *data, byte bpp) const {
|
||||
|
@ -13,7 +13,7 @@
|
||||
namespace AGS3 {
|
||||
|
||||
// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
|
||||
template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
|
||||
template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
|
||||
void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
|
||||
const int xDir = args.horizFlip ? -1 : 1;
|
||||
byte rSrc, gSrc, bSrc, aSrc;
|
||||
@ -45,13 +45,13 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
|
||||
args.xStart = 0;
|
||||
}
|
||||
int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
|
||||
if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
|
||||
if (Scale) yCtrHeight = args.dstRect.height();
|
||||
if (args.yStart < 0) {
|
||||
yCtr = -args.yStart;
|
||||
destY = 0;
|
||||
if (ScaleThreshold != 0) {
|
||||
if (Scale) {
|
||||
scaleYCtr = yCtr * args.scaleY;
|
||||
srcYCtr = scaleYCtr / ScaleThreshold;
|
||||
srcYCtr = scaleYCtr / SCALE_THRESHOLD;
|
||||
}
|
||||
}
|
||||
if (args.yStart + yCtrHeight > args.destArea.h) {
|
||||
@ -65,8 +65,7 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
|
||||
for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
|
||||
uint32x4_t xCtrWidthSIMD = vdupq_n_u32(xCtrWidth); // This is the width of the row
|
||||
|
||||
if (ScaleThreshold == 0) {
|
||||
// If we are not scaling the image
|
||||
if (!Scale) {
|
||||
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
|
||||
byte *destPtr = &destP[destX * DestBytesPerPixel];
|
||||
// Skip pixels that are beyond the row
|
||||
@ -78,7 +77,7 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
|
||||
srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
|
||||
} else {
|
||||
// Here we are scaling the image
|
||||
int newSrcYCtr = scaleYCtr / ScaleThreshold;
|
||||
int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
|
||||
// Since the source yctr might not update every row of the destination, we have
|
||||
// to see if we are on a new row...
|
||||
if (srcYCtr != newSrcYCtr) {
|
||||
@ -94,12 +93,8 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
|
||||
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
|
||||
if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
|
||||
uint32x4_t indexes = vdupq_n_u32(scaleXCtr);
|
||||
#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
|
||||
// Calculate in parallel the indexes of the pixels
|
||||
indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), 8), SrcBytesPerPixel);
|
||||
#else
|
||||
#error Change code to allow different scale threshold!
|
||||
#endif
|
||||
indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), SrcBytesPerPixel);
|
||||
// Simply memcpy them in. memcpy has no real performance overhead here
|
||||
memcpy(&srcBuffer[0*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 0), SrcBytesPerPixel);
|
||||
memcpy(&srcBuffer[1*(uintptr_t)SrcBytesPerPixel], srcP + vgetq_lane_u32(indexes, 1), SrcBytesPerPixel);
|
||||
@ -132,7 +127,7 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
|
||||
// Drawing the last few not scaled pixels here.
|
||||
// Same as the loop above but now we check if we are going to overflow,
|
||||
// and thus we don't need to mask out pixels that go over the row.
|
||||
if (ScaleThreshold == 0) {
|
||||
if (!Scale) {
|
||||
for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
|
||||
byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
|
||||
drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u32(0));
|
||||
@ -150,8 +145,8 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
|
||||
// For the last 4 pixels, we just do them in serial, nothing special
|
||||
for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
|
||||
const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
|
||||
if (ScaleThreshold != 0) {
|
||||
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * SrcBytesPerPixel);
|
||||
if (Scale) {
|
||||
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * SrcBytesPerPixel);
|
||||
}
|
||||
byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
|
||||
uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
|
||||
@ -184,7 +179,7 @@ void BITMAP::drawInner4BppWithConv(BITMAP::DrawInnerArgs &args) {
|
||||
}
|
||||
}
|
||||
|
||||
template<int ScaleThreshold>
|
||||
template<bool Scale>
|
||||
void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
|
||||
const int xDir = args.horizFlip ? -1 : 1;
|
||||
byte rSrc, gSrc, bSrc, aSrc;
|
||||
@ -213,13 +208,13 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
|
||||
args.xStart = 0;
|
||||
}
|
||||
int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
|
||||
if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
|
||||
if (Scale) yCtrHeight = args.dstRect.height();
|
||||
if (args.yStart < 0) {
|
||||
yCtr = -args.yStart;
|
||||
destY = 0;
|
||||
if (ScaleThreshold != 0) {
|
||||
if (Scale) {
|
||||
scaleYCtr = yCtr * args.scaleY;
|
||||
srcYCtr = scaleYCtr / ScaleThreshold;
|
||||
srcYCtr = scaleYCtr / SCALE_THRESHOLD;
|
||||
}
|
||||
}
|
||||
if (args.yStart + yCtrHeight > args.destArea.h) {
|
||||
@ -232,7 +227,7 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
|
||||
args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
|
||||
for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
|
||||
uint16x8_t xCtrWidthSIMD = vmovq_n_u16(xCtrWidth); // This is the width of the row
|
||||
if (ScaleThreshold == 0) {
|
||||
if (!Scale) {
|
||||
// If we are not scaling the image
|
||||
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
|
||||
byte *destPtr = &destP[destX * 2];
|
||||
@ -245,7 +240,7 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
|
||||
srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
|
||||
} else {
|
||||
// Here we are scaling the image
|
||||
int newSrcYCtr = scaleYCtr / ScaleThreshold;
|
||||
int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
|
||||
// Since the source yctr might not update every row of the destination, we have
|
||||
// to see if we are on a new row...
|
||||
if (srcYCtr != newSrcYCtr) {
|
||||
@ -261,13 +256,9 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
|
||||
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
|
||||
if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
|
||||
uint32x4_t indexes = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
|
||||
#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
|
||||
// Calculate in parallel the indexes of the pixels
|
||||
indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), 8), 2);
|
||||
indexes2 = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), 8), 2);
|
||||
#else
|
||||
#error Change code to allow different scale threshold!
|
||||
#endif
|
||||
indexes = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 2);
|
||||
indexes2 = vmulq_n_u32(vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS), 2);
|
||||
// Simply memcpy them in. memcpy has no real performance overhead here
|
||||
srcBuffer[0] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 0));
|
||||
srcBuffer[1] = *(const uint16 *)(srcP + vgetq_lane_u32(indexes, 1));
|
||||
@ -304,7 +295,7 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
|
||||
// Drawing the last few not scaled pixels here.
|
||||
// Same as the loop above but now we check if we are going to overflow,
|
||||
// and thus we don't need to mask out pixels that go over the row.
|
||||
if (ScaleThreshold == 0) {
|
||||
if (!Scale) {
|
||||
for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
|
||||
byte *destPtr = &destP[destX * 2];
|
||||
drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, vmovq_n_u16(0));
|
||||
@ -322,8 +313,8 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
|
||||
// For the last 4 pixels, we just do them in serial, nothing special
|
||||
for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
|
||||
const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
|
||||
if (ScaleThreshold != 0) {
|
||||
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * 2);
|
||||
if (Scale) {
|
||||
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * 2);
|
||||
}
|
||||
byte *destVal = (byte *)&destP[destX * 2];
|
||||
uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
|
||||
@ -355,7 +346,7 @@ void BITMAP::drawInner2Bpp(BITMAP::DrawInnerArgs &args) {
|
||||
}
|
||||
}
|
||||
|
||||
template<int ScaleThreshold>
|
||||
template<bool Scale>
|
||||
void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
|
||||
const int xDir = args.horizFlip ? -1 : 1;
|
||||
uint8x16_t transColors = vld1q_dup_u8(&args.transColor);
|
||||
@ -377,13 +368,13 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
|
||||
args.xStart = 0;
|
||||
}
|
||||
int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
|
||||
if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
|
||||
if (Scale) yCtrHeight = args.dstRect.height();
|
||||
if (args.yStart < 0) {
|
||||
yCtr = -args.yStart;
|
||||
destY = 0;
|
||||
if (ScaleThreshold != 0) {
|
||||
if (Scale) {
|
||||
scaleYCtr = yCtr * args.scaleY;
|
||||
srcYCtr = scaleYCtr / ScaleThreshold;
|
||||
srcYCtr = scaleYCtr / SCALE_THRESHOLD;
|
||||
}
|
||||
}
|
||||
if (args.yStart + yCtrHeight > args.destArea.h) {
|
||||
@ -395,10 +386,10 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
|
||||
args.horizFlip ? args.srcArea.right - 16 : args.srcArea.left,
|
||||
args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
|
||||
for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
|
||||
if (ScaleThreshold != 0) {
|
||||
if (Scale) {
|
||||
// So here we update the srcYCtr differently due to this being for
|
||||
// scaling
|
||||
int newSrcYCtr = scaleYCtr / ScaleThreshold;
|
||||
int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
|
||||
if (srcYCtr != newSrcYCtr) {
|
||||
// Since the source yctr might not update every row of the destination, we have
|
||||
// to see if we are on a new row...
|
||||
@ -415,18 +406,14 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
|
||||
// can't have any blending applied to them
|
||||
uint8x16_t destCols = vld1q_u8(destPtr);
|
||||
uint8x16_t srcCols = vld1q_u8(srcP + xDir * xCtr);
|
||||
if (ScaleThreshold != 0) {
|
||||
if (Scale) {
|
||||
// If we are scaling, we have to set each pixel individually
|
||||
uint32x4_t indexes1 = vdupq_n_u32(scaleXCtr), indexes2 = vdupq_n_u32(scaleXCtr);
|
||||
uint32x4_t indexes3 = vdupq_n_u32(scaleXCtr), indexes4 = vdupq_n_u32(scaleXCtr);
|
||||
#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
|
||||
indexes1 = vshrq_n_u32(vaddq_u32(indexes1, scaleAdds1), 8);
|
||||
indexes2 = vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), 8);
|
||||
indexes3 = vshrq_n_u32(vaddq_u32(indexes3, scaleAdds3), 8);
|
||||
indexes4 = vshrq_n_u32(vaddq_u32(indexes4, scaleAdds4), 8);
|
||||
#else
|
||||
#error Change code to allow different scale threshold!
|
||||
#endif
|
||||
indexes1 = vshrq_n_u32(vaddq_u32(indexes1, scaleAdds1), SCALE_THRESHOLD_BITS);
|
||||
indexes2 = vshrq_n_u32(vaddq_u32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS);
|
||||
indexes3 = vshrq_n_u32(vaddq_u32(indexes3, scaleAdds3), SCALE_THRESHOLD_BITS);
|
||||
indexes4 = vshrq_n_u32(vaddq_u32(indexes4, scaleAdds4), SCALE_THRESHOLD_BITS);
|
||||
srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 0)], srcCols, 0);
|
||||
srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 1)], srcCols, 1);
|
||||
srcCols = vsetq_lane_u8(srcP[vgetq_lane_u32(indexes1, 2)], srcCols, 2);
|
||||
@ -462,8 +449,8 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
|
||||
if (args.horizFlip) srcP += 15;
|
||||
for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += args.scaleX) {
|
||||
const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
|
||||
if (ScaleThreshold != 0) {
|
||||
srcCol = (const byte *)(srcP + scaleXCtr / ScaleThreshold);
|
||||
if (Scale) {
|
||||
srcCol = (const byte *)(srcP + scaleXCtr / SCALE_THRESHOLD);
|
||||
}
|
||||
// Check if this is a transparent color we should skip
|
||||
if (args.skipTrans && *srcCol == args.transColor)
|
||||
@ -475,21 +462,21 @@ void BITMAP::drawInner1Bpp(BITMAP::DrawInnerArgs &args) {
|
||||
if (args.horizFlip) srcP -= 15; // Undo what we did up there
|
||||
destP += args.destArea.pitch; // Goto next row
|
||||
// Only advance the src row by 1 every time like this if we don't scale
|
||||
if (ScaleThreshold == 0) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
|
||||
if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template void BITMAP::drawInner4BppWithConv<4, 4, 0>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner4BppWithConv<4, 4, 0x100>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner4BppWithConv<4, 2, 0>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner4BppWithConv<4, 2, 0x100>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner4BppWithConv<2, 4, 0>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner4BppWithConv<2, 4, 0x100>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner2Bpp<0>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner2Bpp<0x100>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner1Bpp<0>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner1Bpp<0x100>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner4BppWithConv<4, 4, false>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner4BppWithConv<4, 4, true>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner4BppWithConv<4, 2, false>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner4BppWithConv<4, 2, true>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner4BppWithConv<2, 4, false>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner4BppWithConv<2, 4, true>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner2Bpp<false>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner2Bpp<true>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner1Bpp<false>(DrawInnerArgs &args);
|
||||
template void BITMAP::drawInner1Bpp<true>(DrawInnerArgs &args);
|
||||
|
||||
} // namespace AGS3
|
||||
|
||||
|
@ -25,7 +25,7 @@ inline uint32 extract32_idx3(__m128i x) {
|
||||
}
|
||||
|
||||
// This template handles 2bpp and 4bpp, the other specializations handle 1bpp and format conversion blits
|
||||
template<int DestBytesPerPixel, int SrcBytesPerPixel, int ScaleThreshold>
|
||||
template<int DestBytesPerPixel, int SrcBytesPerPixel, bool Scale>
|
||||
void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
|
||||
const int xDir = args.horizFlip ? -1 : 1;
|
||||
byte rSrc, gSrc, bSrc, aSrc;
|
||||
@ -57,13 +57,13 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
|
||||
args.xStart = 0;
|
||||
}
|
||||
int destY = args.yStart, srcYCtr = 0, yCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 4 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
|
||||
if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
|
||||
if (Scale) yCtrHeight = args.dstRect.height();
|
||||
if (args.yStart < 0) {
|
||||
yCtr = -args.yStart;
|
||||
destY = 0;
|
||||
if (ScaleThreshold != 0) {
|
||||
if (Scale) {
|
||||
scaleYCtr = yCtr * args.scaleY;
|
||||
srcYCtr = scaleYCtr / ScaleThreshold;
|
||||
srcYCtr = scaleYCtr / SCALE_THRESHOLD;
|
||||
}
|
||||
}
|
||||
if (args.yStart + yCtrHeight > args.destArea.h) {
|
||||
@ -77,7 +77,7 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
|
||||
for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
|
||||
__m128i xCtrWidthSIMD = _mm_set1_epi32(xCtrWidth); // This is the width of the row
|
||||
|
||||
if (ScaleThreshold == 0) {
|
||||
if (!Scale) {
|
||||
// If we are not scaling the image
|
||||
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
|
||||
byte *destPtr = &destP[destX * DestBytesPerPixel];
|
||||
@ -90,7 +90,7 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
|
||||
srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
|
||||
} else {
|
||||
// Here we are scaling the image
|
||||
int newSrcYCtr = scaleYCtr / ScaleThreshold;
|
||||
int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
|
||||
// Since the source yctr might not update every row of the destination, we have
|
||||
// to see if we are on a new row...
|
||||
if (srcYCtr != newSrcYCtr) {
|
||||
@ -106,15 +106,11 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
|
||||
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
|
||||
if (yCtr + 1 == yCtrHeight && xCtr + 4 > xCtrWidth) break; // Don't go past the last 4 pixels
|
||||
__m128i indexes = _mm_set1_epi32(scaleXCtr);
|
||||
#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
|
||||
// Calculate in parallel the indexes of the pixels
|
||||
if (SrcBytesPerPixel == 4)
|
||||
indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), 8), 2);
|
||||
indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 2);
|
||||
else
|
||||
indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), 8), 1);
|
||||
#else
|
||||
#error Change code to allow different scale threshold!
|
||||
#endif
|
||||
indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 1);
|
||||
// Simply memcpy them in. memcpy has no real performance overhead here
|
||||
memcpy(&srcBuffer[0*(size_t)SrcBytesPerPixel], srcP + extract32_idx0(indexes), SrcBytesPerPixel);
|
||||
memcpy(&srcBuffer[1*(size_t)SrcBytesPerPixel], srcP + extract32_idx1(indexes), SrcBytesPerPixel);
|
||||
@ -147,7 +143,7 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
|
||||
// Drawing the last few not scaled pixels here.
|
||||
// Same as the loop above but now we check if we are going to overflow,
|
||||
// and thus we don't need to mask out pixels that go over the row.
|
||||
if (ScaleThreshold == 0) {
|
||||
if (!Scale) {
|
||||
for (; xCtr + 4 < xCtrWidth; destX += 4, xCtr += 4, xCtrBpp += SrcBytesPerPixel*4) {
|
||||
byte *destPtr = &destP[(ptrdiff_t)destX * DestBytesPerPixel];
|
||||
drawPixelSIMD<DestBytesPerPixel, SrcBytesPerPixel>(destPtr, srcP, tint, alphas, maskedAlphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_setzero_si128());
|
||||
@ -165,8 +161,8 @@ void BITMAP::drawInner4BppWithConv(DrawInnerArgs &args) {
|
||||
// For the last 4 pixels, we just do them in serial, nothing special
|
||||
for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += SrcBytesPerPixel) {
|
||||
const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
|
||||
if (ScaleThreshold != 0) {
|
||||
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * SrcBytesPerPixel);
|
||||
if (Scale) {
|
||||
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * SrcBytesPerPixel);
|
||||
}
|
||||
byte *destVal = (byte *)&destP[destX * DestBytesPerPixel];
|
||||
uint32 srcCol = getColor(srcColPtr, SrcBytesPerPixel);
|
||||
@ -228,13 +224,13 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
|
||||
args.xStart = 0;
|
||||
}
|
||||
int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = (xCtrWidth % 8 == 0) ? args.dstRect.height() : (args.dstRect.height() - 1);
|
||||
if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
|
||||
if (Scale) yCtrHeight = args.dstRect.height();
|
||||
if (args.yStart < 0) {
|
||||
yCtr = -args.yStart;
|
||||
destY = 0;
|
||||
if (ScaleThreshold != 0) {
|
||||
if (Scale) {
|
||||
scaleYCtr = yCtr * args.scaleY;
|
||||
srcYCtr = scaleYCtr / ScaleThreshold;
|
||||
srcYCtr = scaleYCtr / SCALE_THRESHOLD;
|
||||
}
|
||||
}
|
||||
if (args.yStart + yCtrHeight > args.destArea.h) {
|
||||
@ -247,7 +243,7 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
|
||||
args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
|
||||
for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
|
||||
__m128i xCtrWidthSIMD = _mm_set1_epi16(xCtrWidth); // This is the width of the row
|
||||
if (ScaleThreshold == 0) {
|
||||
if (!Scale) {
|
||||
// If we are not scaling the image
|
||||
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
|
||||
byte *destPtr = &destP[destX * 2];
|
||||
@ -260,7 +256,7 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
|
||||
srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
|
||||
} else {
|
||||
// Here we are scaling the image
|
||||
int newSrcYCtr = scaleYCtr / ScaleThreshold;
|
||||
int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
|
||||
// Since the source yctr might not update every row of the destination, we have
|
||||
// to see if we are on a new row...
|
||||
if (srcYCtr != newSrcYCtr) {
|
||||
@ -276,13 +272,9 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
|
||||
for (int xCtr = xCtrStart, xCtrBpp = xCtrBppStart, destX = args.xStart, scaleXCtr = xCtrStart * args.scaleX; xCtr < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
|
||||
if (yCtr + 1 == yCtrHeight && xCtr + 8 > xCtrWidth) break;
|
||||
__m128i indexes = _mm_set1_epi32(scaleXCtr), indexes2 = _mm_set1_epi32(scaleXCtr);
|
||||
#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
|
||||
// Calculate in parallel the indexes of the pixels
|
||||
indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), 8), 1);
|
||||
indexes2 = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), 8), 1);
|
||||
#else
|
||||
#error Change code to allow different scale threshold!
|
||||
#endif
|
||||
indexes = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes, scaleAdds), SCALE_THRESHOLD_BITS), 1);
|
||||
indexes2 = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS), 1);
|
||||
// Simply memcpy them in. memcpy has no real performance overhead here
|
||||
srcBuffer[0] = *(const uint16 *)(srcP + extract32_idx0(indexes));
|
||||
srcBuffer[1] = *(const uint16 *)(srcP + extract32_idx1(indexes));
|
||||
@ -319,7 +311,7 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
|
||||
// Drawing the last few not scaled pixels here.
|
||||
// Same as the loop above but now we check if we are going to overflow,
|
||||
// and thus we don't need to mask out pixels that go over the row.
|
||||
if (ScaleThreshold == 0) {
|
||||
if (!Scale) {
|
||||
for (; xCtr + 8 < xCtrWidth; destX += 8, xCtr += 8, xCtrBpp += 16) {
|
||||
byte *destPtr = &destP[destX * 2];
|
||||
drawPixelSIMD2Bpp(destPtr, srcP, tint, alphas, transColors, xDir, xCtrBpp, args.srcAlpha, args.skipTrans, args.horizFlip, args.useTint, _mm_setzero_si128());
|
||||
@ -337,8 +329,8 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
|
||||
// For the last 4 pixels, we just do them in serial, nothing special
|
||||
for (; xCtr < xCtrWidth; ++destX, ++xCtr, xCtrBpp += 2) {
|
||||
const byte *srcColPtr = (const byte *)(srcP + xDir * xCtrBpp);
|
||||
if (ScaleThreshold != 0) {
|
||||
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / ScaleThreshold * 2);
|
||||
if (Scale) {
|
||||
srcColPtr = (const byte *)(srcP + (xCtr * args.scaleX) / SCALE_THRESHOLD * 2);
|
||||
}
|
||||
byte *destVal = (byte *)&destP[destX * 2];
|
||||
uint32 srcCol = (uint32)(*(const uint16 *)srcColPtr);
|
||||
@ -370,7 +362,7 @@ void BITMAP::drawInner2Bpp(DrawInnerArgs &args) {
|
||||
}
|
||||
}
|
||||
|
||||
template<int ScaleThreshold>
|
||||
template<bool Scale>
|
||||
void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
|
||||
const int xDir = args.horizFlip ? -1 : 1;
|
||||
__m128i transColors = _mm_set1_epi16(args.transColor | (args.transColor << 8));
|
||||
@ -392,13 +384,13 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
|
||||
args.xStart = 0;
|
||||
}
|
||||
int destY = args.yStart, yCtr = 0, srcYCtr = 0, scaleYCtr = 0, yCtrHeight = args.dstRect.height();
|
||||
if (ScaleThreshold != 0) yCtrHeight = args.dstRect.height();
|
||||
if (Scale) yCtrHeight = args.dstRect.height();
|
||||
if (args.yStart < 0) {
|
||||
yCtr = -args.yStart;
|
||||
destY = 0;
|
||||
if (ScaleThreshold != 0) {
|
||||
if (Scale) {
|
||||
scaleYCtr = yCtr * args.scaleY;
|
||||
srcYCtr = scaleYCtr / ScaleThreshold;
|
||||
srcYCtr = scaleYCtr / SCALE_THRESHOLD;
|
||||
}
|
||||
}
|
||||
if (args.yStart + yCtrHeight > args.destArea.h) {
|
||||
@ -410,10 +402,10 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
|
||||
args.horizFlip ? args.srcArea.right - 16 : args.srcArea.left,
|
||||
args.vertFlip ? args.srcArea.bottom - 1 - yCtr : args.srcArea.top + yCtr);
|
||||
for (; yCtr < yCtrHeight; ++destY, ++yCtr, scaleYCtr += args.scaleY) {
|
||||
if (ScaleThreshold != 0) {
|
||||
if (Scale) {
|
||||
// So here we update the srcYCtr differently due to this being for
|
||||
// scaling
|
||||
int newSrcYCtr = scaleYCtr / ScaleThreshold;
|
||||
int newSrcYCtr = scaleYCtr / SCALE_THRESHOLD;
|
||||
if (srcYCtr != newSrcYCtr) {
|
||||
// Since the source yctr might not update every row of the destination, we have
|
||||
// to see if we are on a new row...
|
||||
@ -430,19 +422,15 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
|
||||
// can't have any blending applied to them
|
||||
__m128i destCols = _mm_loadu_si128((const __m128i *)destPtr);
|
||||
__m128i srcCols = _mm_loadu_si128((const __m128i *)(srcP + xDir * xCtr));
|
||||
if (ScaleThreshold != 0) {
|
||||
if (Scale) {
|
||||
// If we are scaling, we have to set each pixel individually
|
||||
__m128i indexes1 = _mm_set1_epi32(scaleXCtr), indexes2 = _mm_set1_epi32(scaleXCtr);
|
||||
__m128i indexes3 = _mm_set1_epi32(scaleXCtr), indexes4 = _mm_set1_epi32(scaleXCtr);
|
||||
#if (ScaleThreshold == 0 || ScaleThreshold == 0x100)
|
||||
// Calculate in parallel the indexes of the pixels
|
||||
indexes1 = _mm_srli_epi32(_mm_add_epi32(indexes1, scaleAdds1), 8);
|
||||
indexes2 = _mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), 8);
|
||||
indexes3 = _mm_srli_epi32(_mm_add_epi32(indexes3, scaleAdds3), 8);
|
||||
indexes4 = _mm_srli_epi32(_mm_add_epi32(indexes4, scaleAdds4), 8);
|
||||
#else
|
||||
#error Change code to allow different scale threshold!
|
||||
#endif
|
||||
indexes1 = _mm_srli_epi32(_mm_add_epi32(indexes1, scaleAdds1), SCALE_THRESHOLD_BITS);
|
||||
indexes2 = _mm_srli_epi32(_mm_add_epi32(indexes2, scaleAdds2), SCALE_THRESHOLD_BITS);
|
||||
indexes3 = _mm_srli_epi32(_mm_add_epi32(indexes3, scaleAdds3), SCALE_THRESHOLD_BITS);
|
||||
indexes4 = _mm_srli_epi32(_mm_add_epi32(indexes4, scaleAdds4), SCALE_THRESHOLD_BITS);
|
||||
srcCols = _mm_set_epi8(
|
||||
srcP[extract32_idx3(indexes4)],
|
||||
srcP[extract32_idx2(indexes4)],
|
||||
@ -482,8 +470,8 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
|
||||
if (args.horizFlip) srcP += 15;
|
||||
for (; xCtr < xCtrWidth; ++destX, ++xCtr, scaleXCtr += args.scaleX) {
|
||||
const byte *srcCol = (const byte *)(srcP + xDir * xCtr);
|
||||
if (ScaleThreshold != 0) {
|
||||
srcCol = (const byte *)(srcP + scaleXCtr / ScaleThreshold);
|
||||
if (Scale) {
|
||||
srcCol = (const byte *)(srcP + scaleXCtr / SCALE_THRESHOLD);
|
||||
}
|
||||
// Check if this is a transparent color we should skip
|
||||
if (args.skipTrans && *srcCol == args.transColor)
|
||||
@ -495,7 +483,7 @@ void BITMAP::drawInner1Bpp(DrawInnerArgs &args) {
|
||||
if (args.horizFlip) srcP -= 15; // Undo what we did up there
|
||||
destP += args.destArea.pitch; // Goto next row
|
||||
// Only advance the src row by 1 every time like this if we don't scale
|
||||
if (ScaleThreshold == 0) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
|
||||
if (!Scale) srcP += args.vertFlip ? -args.src.pitch : args.src.pitch;
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user