mirror of
https://github.com/libretro/scummvm.git
synced 2025-02-24 05:01:43 +00:00
apply patch by Fingolfin to optimize scalers + more
svn-id: r36338
This commit is contained in:
parent
c228259fff
commit
493eb0c5a4
@ -25,40 +25,8 @@
|
||||
#include "graphics/scaler/intern.h"
|
||||
#include "CEScaler.h"
|
||||
|
||||
int redblueMasks[] = { 0x7C1F, 0xF81F };
|
||||
int greenMasks[] = { 0x03E0, 0x07E0 };
|
||||
|
||||
static int maskUsed;
|
||||
|
||||
void initCEScaler(void) {
|
||||
if (gBitFormat == 555)
|
||||
maskUsed = 0;
|
||||
else
|
||||
maskUsed = 1;
|
||||
}
|
||||
|
||||
// FIXME: Fingolfin says: The following interpolation code is a lot slower than it needs
|
||||
// to be. The reason: Using the value of a global variable to index two global arrays is
|
||||
// extremely difficult if not impossible for the compiler to optimize. At the very least,
|
||||
// the two arrays should be 'static const', but even then, memory access is required.
|
||||
// To avoid this, one could use the techniques used by our other scalers. See also the
|
||||
// interpolate functions in graphics/scaler/intern.h.
|
||||
// Even if those can't be used directly for some reasons (e.g. the compiler has problems
|
||||
// with templates), then still the *techniques* could and should be used. I would expect
|
||||
// that this way, even the C version of PocketPCPortrait() should get a big speed boost.
|
||||
|
||||
static inline uint16 CEinterpolate16_4(uint16 p1, uint16 p2, uint16 p3, uint16 p4)
|
||||
{
|
||||
return ((((p1 & redblueMasks[maskUsed]) + (p2 & redblueMasks[maskUsed]) + (p3 & redblueMasks[maskUsed]) + (p4 & redblueMasks[maskUsed])) / 4) & redblueMasks[maskUsed]) |
|
||||
((((p1 & greenMasks[maskUsed]) + (p2 & greenMasks[maskUsed]) + (p3 & greenMasks[maskUsed]) + (p4 & greenMasks[maskUsed])) / 4) & greenMasks[maskUsed]);
|
||||
}
|
||||
|
||||
static inline uint16 CEinterpolate16_2(uint16 p1, int w1, uint16 p2, int w2) {
|
||||
return ((((p1 & redblueMasks[maskUsed]) * w1 + (p2 & redblueMasks[maskUsed]) * w2) / (w1 + w2)) & redblueMasks[maskUsed]) |
|
||||
((((p1 & greenMasks[maskUsed]) * w1 + (p2 & greenMasks[maskUsed]) * w2) / (w1 + w2)) & greenMasks[maskUsed]);
|
||||
}
|
||||
|
||||
void PocketPCPortrait(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
|
||||
template<int bitFormat>
|
||||
void PocketPCPortraitTemplate(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
|
||||
uint8 *work;
|
||||
int i;
|
||||
|
||||
@ -73,9 +41,9 @@ void PocketPCPortrait(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint3
|
||||
uint16 color3 = *(((const uint16 *)srcPtr) + (i + 2));
|
||||
uint16 color4 = *(((const uint16 *)srcPtr) + (i + 3));
|
||||
|
||||
*(((uint16 *)work) + 0) = CEinterpolate16_2(color1, 3, color2, 1);
|
||||
*(((uint16 *)work) + 1) = CEinterpolate16_2(color2, 1, color3, 1);
|
||||
*(((uint16 *)work) + 2) = CEinterpolate16_2(color3, 1, color4, 3);
|
||||
*(((uint16 *)work) + 0) = interpolate32_3_1<bitFormat>(color1, color2);
|
||||
*(((uint16 *)work) + 1) = interpolate32_1_1<bitFormat>(color2, color3);
|
||||
*(((uint16 *)work) + 2) = interpolate32_3_1<bitFormat>(color4, color3);
|
||||
|
||||
work += 3 * sizeof(uint16);
|
||||
}
|
||||
@ -83,61 +51,66 @@ void PocketPCPortrait(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint3
|
||||
dstPtr += dstPitch;
|
||||
}
|
||||
}
|
||||
MAKE_WRAPPER(PocketPCPortrait)
|
||||
|
||||
// FIXME: Fingolfin says: Please document this function. What does it compute? How
|
||||
// does it differ from the code in aspect.cpp ? It would be nice to speed up this function
|
||||
// here using the ideas and tricks from aspect.cpp and the comment above, as right now, it
|
||||
// is rather hard for the compiler to optimize this code properly.
|
||||
// Our version of an aspect scaler. Main difference is the out-of-place
|
||||
// operation, omitting a straight blit step the sdl backend does. Also,
|
||||
// tests show unaligned access errors with the stock aspect scaler.
|
||||
void PocketPCLandscapeAspect(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
|
||||
|
||||
#define RB(x) ((x & redblueMasks[maskUsed])<<8)
|
||||
#define G(x) ((x & greenMasks[maskUsed])<<3)
|
||||
const int redblueMasks[] = { 0x7C1F, 0xF81F };
|
||||
const int greenMasks[] = { 0x03E0, 0x07E0 };
|
||||
const int RBM = redblueMasks[gBitFormat == 565];
|
||||
const int GM = greenMasks[gBitFormat == 565];
|
||||
|
||||
int i,j;
|
||||
unsigned int p1, p2;
|
||||
uint8 *inbuf, *outbuf, *instart, *outstart;
|
||||
|
||||
#define RB(x) ((x & RBM)<<8)
|
||||
#define G(x) ((x & GM)<<3)
|
||||
|
||||
#define P20(x) (((x)>>2)-((x)>>4))
|
||||
#define P40(x) (((x)>>1)-((x)>>3))
|
||||
#define P60(x) (((x)>>1)+((x)>>3))
|
||||
#define P80(x) (((x)>>1)+((x)>>2)+((x)>>4))
|
||||
|
||||
#define MAKEPIXEL(rb,g) ((((rb)>>8) & redblueMasks[maskUsed] | ((g)>>3) & greenMasks[maskUsed]))
|
||||
#define MAKEPIXEL(rb,g) ((((rb)>>8) & RBM | ((g)>>3) & GM))
|
||||
|
||||
int i,j;
|
||||
unsigned int p1;
|
||||
unsigned int p2;
|
||||
uint16 * inbuf;
|
||||
uint16 * outbuf;
|
||||
inbuf = (uint16 *)srcPtr;
|
||||
outbuf = (uint16 *)dstPtr;
|
||||
inbuf = (uint8 *)srcPtr;
|
||||
outbuf = (uint8 *)dstPtr;
|
||||
height /= 5;
|
||||
|
||||
uint16 srcPitch16 = (uint16)(srcPitch / sizeof(uint16));
|
||||
uint16 dstPitch16 = (uint16)(dstPitch / sizeof(uint16));
|
||||
|
||||
for (i = 0; i < height/5; i++) {
|
||||
for (i = 0; i < height; i++) {
|
||||
instart = inbuf;
|
||||
outstart = outbuf;
|
||||
for (j=0; j < width; j++) {
|
||||
p1 = *((uint16*)inbuf+j); inbuf += srcPitch16;
|
||||
*((uint16*)outbuf+j) = p1; outbuf += dstPitch16;
|
||||
|
||||
p2 = *((uint16*)inbuf+j); inbuf += srcPitch16;
|
||||
*((uint16*)outbuf+j) = MAKEPIXEL(P20(RB(p1))+P80(RB(p2)),P20(G(p1))+P80(G(p2))); outbuf += dstPitch16;
|
||||
p1 = *(uint16*)inbuf; inbuf += srcPitch;
|
||||
*(uint16*)outbuf = p1; outbuf += dstPitch;
|
||||
|
||||
p2 = *(uint16*)inbuf; inbuf += srcPitch;
|
||||
*(uint16*)outbuf = MAKEPIXEL(P20(RB(p1))+P80(RB(p2)),P20(G(p1))+P80(G(p2))); outbuf += dstPitch;
|
||||
|
||||
p1 = p2;
|
||||
p2 = *((uint16*)inbuf+j); inbuf += srcPitch16;
|
||||
*((uint16*)outbuf+j) = MAKEPIXEL(P40(RB(p1))+P60(RB(p2)),P40(G(p1))+P60(G(p2))); outbuf += dstPitch16;
|
||||
p2 = *(uint16*)inbuf; inbuf += srcPitch;
|
||||
*(uint16*)outbuf = MAKEPIXEL(P40(RB(p1))+P60(RB(p2)),P40(G(p1))+P60(G(p2))); outbuf += dstPitch;
|
||||
|
||||
p1 = p2;
|
||||
p2 = *((uint16*)inbuf+j); inbuf += srcPitch16;
|
||||
*((uint16*)outbuf+j) = MAKEPIXEL(P60(RB(p1))+P40(RB(p2)),P60(G(p1))+P40(G(p2))); outbuf += dstPitch16;
|
||||
p2 = *(uint16*)inbuf; inbuf += srcPitch;
|
||||
*(uint16*)outbuf = MAKEPIXEL(P60(RB(p1))+P40(RB(p2)),P60(G(p1))+P40(G(p2))); outbuf += dstPitch;
|
||||
|
||||
p1 = p2;
|
||||
p2 = *((uint16*)inbuf+j);
|
||||
*((uint16*)outbuf+j) = MAKEPIXEL(P80(RB(p1))+P20(RB(p2)),P80(G(p1))+P20(G(p2))); outbuf += dstPitch16;
|
||||
p2 = *(uint16*)inbuf;
|
||||
*(uint16*)outbuf = MAKEPIXEL(P80(RB(p1))+P20(RB(p2)),P80(G(p1))+P20(G(p2))); outbuf += dstPitch;
|
||||
|
||||
*((uint16*)outbuf+j) = p2;
|
||||
*(uint16*)outbuf = p2;
|
||||
|
||||
inbuf = inbuf - srcPitch16*4;
|
||||
outbuf = outbuf - dstPitch16*5;
|
||||
inbuf = inbuf - srcPitch*4 + sizeof(uint16);
|
||||
outbuf = outbuf - dstPitch*5 + sizeof(uint16);
|
||||
}
|
||||
inbuf = inbuf + srcPitch16*5;
|
||||
outbuf = outbuf + dstPitch16*6;
|
||||
inbuf = instart + srcPitch*5;
|
||||
outbuf = outstart + dstPitch*6;
|
||||
}
|
||||
}
|
||||
|
||||
@ -150,10 +123,8 @@ extern "C" {
|
||||
}
|
||||
#endif
|
||||
|
||||
void PocketPCHalf(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
|
||||
#ifdef ARM
|
||||
PocketPCHalfARM(srcPtr, srcPitch, dstPtr, dstPitch, width, height, redbluegreenMasks[maskUsed],roundingconstants[maskUsed]);
|
||||
#else
|
||||
template<int bitFormat>
|
||||
void PocketPCHalfTemplate(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
|
||||
uint8 *work;
|
||||
int i;
|
||||
uint16 srcPitch16 = (uint16)(srcPitch / sizeof(uint16));
|
||||
@ -168,18 +139,29 @@ void PocketPCHalf(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 ds
|
||||
uint16 color2 = *(((const uint16 *)srcPtr) + (i + 1));
|
||||
uint16 color3 = *(((const uint16 *)srcPtr) + (i + srcPitch16));
|
||||
uint16 color4 = *(((const uint16 *)srcPtr) + (i + srcPitch16 + 1));
|
||||
*(((uint16 *)work) + 0) = CEinterpolate16_4(color1, color2, color3, color4);
|
||||
*(((uint16 *)work) + 0) = interpolate16_1_1_1_1<bitFormat>(color1, color2, color3, color4);
|
||||
|
||||
work += sizeof(uint16);
|
||||
}
|
||||
srcPtr += 2 * srcPitch;
|
||||
dstPtr += dstPitch;
|
||||
}
|
||||
}
|
||||
|
||||
void PocketPCHalf(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
|
||||
#ifdef ARM
|
||||
int maskUsed = (gBitFormat == 565);
|
||||
PocketPCHalfARM(srcPtr, srcPitch, dstPtr, dstPitch, width, height, redbluegreenMasks[maskUsed],roundingconstants[maskUsed]);
|
||||
#else
|
||||
if (gBitFormat == 565)
|
||||
PocketPCHalfTemplate<565>(srcPtr, srcPitch, dstPtr, dstPitch, width, height);
|
||||
else
|
||||
PocketPCHalfTemplate<565>(srcPtr, srcPitch, dstPtr, dstPitch, width, height);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void PocketPCHalfZoom(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
|
||||
template<int bitFormat>
|
||||
void PocketPCHalfZoomTemplate(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
|
||||
uint8 *work;
|
||||
int i;
|
||||
uint16 srcPitch16 = (uint16)(srcPitch / sizeof(uint16));
|
||||
@ -191,10 +173,10 @@ void PocketPCHalfZoom(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint3
|
||||
i = 0;
|
||||
work = dstPtr;
|
||||
|
||||
for (int i=0; i<width; i+=2) {
|
||||
for (int i = 0; i < width; i += 2) {
|
||||
uint16 color1 = *(((const uint16 *)srcPtr) + i);
|
||||
uint16 color2 = *(((const uint16 *)srcPtr) + (i + 1));
|
||||
*(((uint16 *)work) + 0) = CEinterpolate16_2(color1, 1, color2, 1);
|
||||
*(((uint16 *)work) + 0) = interpolate32_1_1<bitFormat>(color1, color2);
|
||||
|
||||
work += sizeof(uint16);
|
||||
}
|
||||
@ -202,8 +184,10 @@ void PocketPCHalfZoom(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint3
|
||||
dstPtr += dstPitch;
|
||||
}
|
||||
}
|
||||
MAKE_WRAPPER(PocketPCHalfZoom)
|
||||
|
||||
void SmartphoneLandscape(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
|
||||
template<int bitFormat>
|
||||
void SmartphoneLandscapeTemplate(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
|
||||
uint8 *work;
|
||||
int i;
|
||||
int line = 0;
|
||||
@ -212,14 +196,14 @@ void SmartphoneLandscape(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, ui
|
||||
i = 0;
|
||||
work = dstPtr;
|
||||
|
||||
for (int i=0; i<width; i+=3) {
|
||||
for (int i = 0; i < width; i += 3) {
|
||||
// Filter 2/3
|
||||
uint16 color1 = *(((const uint16 *)srcPtr) + i);
|
||||
uint16 color2 = *(((const uint16 *)srcPtr) + (i + 1));
|
||||
uint16 color3 = *(((const uint16 *)srcPtr) + (i + 2));
|
||||
|
||||
*(((uint16 *)work) + 0) = CEinterpolate16_2(color1, 3, color2, 1);
|
||||
*(((uint16 *)work) + 1) = CEinterpolate16_2(color2, 1, color3, 1);
|
||||
*(((uint16 *)work) + 0) = interpolate32_3_1<bitFormat>(color1, color2);
|
||||
*(((uint16 *)work) + 1) = interpolate32_3_1<bitFormat>(color3, color2);
|
||||
|
||||
work += 2 * sizeof(uint16);
|
||||
}
|
||||
@ -233,3 +217,4 @@ void SmartphoneLandscape(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, ui
|
||||
}
|
||||
}
|
||||
}
|
||||
MAKE_WRAPPER(SmartphoneLandscape)
|
||||
|
@ -39,6 +39,4 @@ DECLARE_SCALER(PocketPCHalfZoom);
|
||||
DECLARE_SCALER(SmartphoneLandscape);
|
||||
//#endif
|
||||
|
||||
void initCEScaler(void);
|
||||
|
||||
#endif
|
||||
|
@ -1399,7 +1399,6 @@ bool OSystem_WINCE3::loadGFXMode() {
|
||||
InitScalers(555);
|
||||
else
|
||||
InitScalers(565);
|
||||
initCEScaler();
|
||||
_overlayFormat.bytesPerPixel = _hwscreen->format->BytesPerPixel;
|
||||
_overlayFormat.rLoss = _hwscreen->format->Rloss;
|
||||
_overlayFormat.gLoss = _hwscreen->format->Gloss;
|
||||
|
Loading…
x
Reference in New Issue
Block a user