mirror of
https://github.com/libretro/scummvm.git
synced 2024-11-30 21:00:39 +00:00
Removed use of LUT16to32 in HQx asm versions, replacing some MMX code with 'plain' x86 code. Advantage: got rid of a 256kb table (reduces cache load, so over here the code is about as fast as before; in particular, since the affected interpolators are not used that often, it seems). Moreover, the new code is more accurate than the old ASM code, which actually differed from what our C++ HQx did (sacrificing precision for speed, i.e., cheating ;-)
svn-id: r36078
This commit is contained in:
parent
80ba7ec844
commit
4098ff66aa
@ -53,17 +53,26 @@ extern "C" {
|
||||
|
||||
#if !defined(_WIN32) && !defined(MACOSX) && !defined(__OS2__)
|
||||
#define RGBtoYUV _RGBtoYUV
|
||||
#define LUT16to32 _LUT16to32
|
||||
#define hqx_highbits _hqx_highbits
|
||||
#define hqx_lowbits _hqx_lowbits
|
||||
#define hqx_low2bits _hqx_low2bits
|
||||
#define hqx_low3bits _hqx_low3bits
|
||||
#define hqx_greenMask _hqx_greenMask
|
||||
#define hqx_redBlueMask _hqx_redBlueMask
|
||||
#define hqx_green_redBlue_Mask _hqx_green_redBlue_Mask
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
uint32 hqx_highbits = 0xF7DEF7DE;
|
||||
uint32 hqx_lowbits = 0x0821;
|
||||
uint32 hqx_low2bits = 0x0C63;
|
||||
uint32 hqx_low3bits = 0x1CE7;
|
||||
uint32 hqx_greenMask = 0;
|
||||
uint32 hqx_redBlueMask = 0;
|
||||
uint32 hqx_green_redBlue_Mask = 0;
|
||||
|
||||
// FIXME/TODO: The following two tables suck up 512 KB. This is bad.
|
||||
// FIXME/TODO: The RGBtoYUV table sucks up 256 KB. This is bad.
|
||||
// In addition we never free them...
|
||||
//
|
||||
// Note: a memory lookup table is *not* necessarily faster than computing
|
||||
@ -72,14 +81,7 @@ uint32 hqx_lowbits = 0x0821;
|
||||
// systems, so main memory has to be accessed, which is about the worst thing
|
||||
// that can happen to code which tries to be fast...
|
||||
//
|
||||
// So we should think about ways to get these smaller / removed. The LUT16to32
|
||||
// is only used by the HQX asm right now; maybe somebody can modify the code
|
||||
// there to work w/o it (and do some benchmarking, too?). To do that, just
|
||||
// do the conversion on the fly, or even do w/o it (as the C++ code manages to),
|
||||
// by making different versions of the code based on gBitFormat (or by writing
|
||||
// bit masks into registers which are computed based on gBitFormat).
|
||||
//
|
||||
// RGBtoYUV is also used by the C(++) version of the HQX code. Maybe we can
|
||||
// So we should think about ways to get these smaller / removed. Maybe we can
|
||||
// use the same technique which is employed by our MPEG code to reduce the
|
||||
// size of the lookup tables at the cost of some additional computations? That
|
||||
// might actually result in a speedup, too, if done right (and the code
|
||||
@ -89,7 +91,6 @@ uint32 hqx_lowbits = 0x0821;
|
||||
// differences are likely to vary a lot between different architectures and
|
||||
// CPUs.
|
||||
uint32 *RGBtoYUV = 0;
|
||||
uint32 *LUT16to32 = 0;
|
||||
}
|
||||
|
||||
void InitLUT(Graphics::PixelFormat format) {
|
||||
@ -101,18 +102,29 @@ void InitLUT(Graphics::PixelFormat format) {
|
||||
// Allocate the YUV/LUT buffers on the fly if needed.
|
||||
if (RGBtoYUV == 0)
|
||||
RGBtoYUV = (uint32 *)malloc(65536 * sizeof(uint32));
|
||||
if (LUT16to32 == 0)
|
||||
LUT16to32 = (uint32 *)malloc(65536 * sizeof(uint32));
|
||||
|
||||
for (int color = 0; color < 65536; ++color) {
|
||||
format.colorToRGB(color, r, g, b);
|
||||
LUT16to32[color] = (r << 16) | (g << 8) | b;
|
||||
|
||||
Y = (r + g + b) >> 2;
|
||||
u = 128 + ((r - b) >> 2);
|
||||
v = 128 + ((-r + 2 * g - b) >> 3);
|
||||
RGBtoYUV[color] = (Y << 16) | (u << 8) | v;
|
||||
}
|
||||
|
||||
#ifdef USE_NASM
|
||||
hqx_lowbits = (1 << format.rShift) | (1 << format.gShift) | (1 << format.bShift),
|
||||
hqx_low2bits = (3 << format.rShift) | (3 << format.gShift) | (3 << format.bShift),
|
||||
hqx_low3bits = (7 << format.rShift) | (7 << format.gShift) | (7 << format.bShift),
|
||||
|
||||
hqx_highbits = format.RGBToColor(255,255,255) ^ hqx_lowbits;
|
||||
|
||||
// FIXME: The following code only does the right thing
|
||||
// if the color order is RGB or BGR, i.e., green is in the middle.
|
||||
hqx_greenMask = format.RGBToColor(0,255,0);
|
||||
hqx_redBlueMask = format.RGBToColor(255,0,255);
|
||||
|
||||
hqx_green_redBlue_Mask = (hqx_greenMask << 16) | hqx_redBlueMask;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -121,24 +133,11 @@ void InitScalers(uint32 BitFormat) {
|
||||
gBitFormat = BitFormat;
|
||||
|
||||
#ifndef DISABLE_HQ_SCALERS
|
||||
#undef kHighBitsMask
|
||||
#undef kLowBitsMask
|
||||
|
||||
if (gBitFormat == 555) {
|
||||
InitLUT(Graphics::createPixelFormat<555>());
|
||||
#ifdef USE_NASM
|
||||
hqx_highbits = Graphics::ColorMasks<555>::kHighBitsMask;
|
||||
hqx_lowbits = Graphics::ColorMasks<555>::kLowBitsMask & 0xFFFF;
|
||||
#endif
|
||||
}
|
||||
if (gBitFormat == 565) {
|
||||
InitLUT(Graphics::createPixelFormat<565>());
|
||||
#ifdef USE_NASM
|
||||
// The uint32 cast here is needed to silence an MSVC warning
|
||||
// (warning C4245: '=': conversion from '' to 'uint32', signed/unsigned mismatch)
|
||||
hqx_highbits = (uint32)Graphics::ColorMasks<565>::kHighBitsMask;
|
||||
hqx_lowbits = Graphics::ColorMasks<565>::kLowBitsMask & 0xFFFF;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -146,9 +145,7 @@ void InitScalers(uint32 BitFormat) {
|
||||
void DestroyScalers(){
|
||||
#ifndef DISABLE_HQ_SCALERS
|
||||
free(RGBtoYUV);
|
||||
free(LUT16to32);
|
||||
RGBtoYUV = 0;
|
||||
LUT16to32 = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -20,10 +20,14 @@
|
||||
|
||||
GLOBAL _hq2x_16
|
||||
|
||||
EXTERN _LUT16to32
|
||||
EXTERN _RGBtoYUV
|
||||
EXTERN _hqx_highbits
|
||||
EXTERN _hqx_lowbits
|
||||
EXTERN _hqx_low2bits
|
||||
EXTERN _hqx_low3bits
|
||||
EXTERN _hqx_greenMask
|
||||
EXTERN _hqx_redBlueMask
|
||||
EXTERN _hqx_green_redBlue_Mask
|
||||
|
||||
SECTION .bss
|
||||
linesleft resd 1
|
||||
@ -165,103 +169,186 @@ SECTION .text
|
||||
; interpolate16_3<bitFormat,5,2,1>
|
||||
; Mix three pixels with weight 5, 2, and 1, respectively: (c1*5+c2*2+c3)/8;
|
||||
%macro Interp6 3
|
||||
mov ecx, [_LUT16to32]
|
||||
movd mm1, [ecx+eax*4]
|
||||
mov edx, %2
|
||||
movd mm2, [ecx+edx*4]
|
||||
mov edx, %3
|
||||
movd mm3, [ecx+edx*4]
|
||||
punpcklbw mm1, [reg_blank]
|
||||
punpcklbw mm2, [reg_blank]
|
||||
punpcklbw mm3, [reg_blank]
|
||||
pmullw mm1, [const5]
|
||||
psllw mm2, 1
|
||||
paddw mm1, mm3
|
||||
paddw mm1, mm2
|
||||
psrlw mm1, 5
|
||||
packuswb mm1, [reg_blank]
|
||||
movd edx, mm1
|
||||
shl dl, 2
|
||||
shr edx, 1
|
||||
shl dx, 3
|
||||
shr edx, 5
|
||||
mov %1, dx
|
||||
; Unpack eax to ecx and multiply by 5
|
||||
mov eax, [w5]
|
||||
mov ecx, eax
|
||||
shl ecx, 16
|
||||
or ecx, eax
|
||||
and ecx, [_hqx_green_redBlue_Mask]
|
||||
; multiply c1 by 5
|
||||
;imul ecx, 5 ; imul works, too, but might be slower on older systems?
|
||||
mov edx, ecx
|
||||
shl ecx, 2
|
||||
add ecx, edx
|
||||
|
||||
; unpack c2 to edx
|
||||
mov eax, %2
|
||||
mov edx, eax
|
||||
shl edx, 16
|
||||
or edx, eax
|
||||
and edx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; add 2*c2 to c1*5
|
||||
add ecx, edx
|
||||
add ecx, edx
|
||||
|
||||
; unpack c3 to edx
|
||||
mov eax, %3
|
||||
mov edx, eax
|
||||
shl edx, 16
|
||||
or edx, eax
|
||||
and edx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; add c3 and 2*c2+c1*5, divide by 8, mask the result
|
||||
add edx, ecx
|
||||
shr edx, 3
|
||||
and edx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; finally, repack the mixed pixel
|
||||
mov ecx, edx
|
||||
shr ecx, 16
|
||||
or edx, ecx
|
||||
|
||||
mov %1, dx
|
||||
%endmacro
|
||||
|
||||
; interpolate16_3<bitFormat,6,1,1>
|
||||
; Mix three pixels with weight 6, 1, and 1, respectively: (c1*6+c2+c3)/8;
|
||||
%macro Interp7 3
|
||||
mov ecx, [_LUT16to32]
|
||||
movd mm1, [ecx+eax*4]
|
||||
mov edx, %2
|
||||
movd mm2, [ecx+edx*4]
|
||||
mov edx, %3
|
||||
movd mm3, [ecx+edx*4]
|
||||
punpcklbw mm1, [reg_blank]
|
||||
punpcklbw mm2, [reg_blank]
|
||||
punpcklbw mm3, [reg_blank]
|
||||
pmullw mm1, [const6]
|
||||
paddw mm2, mm3
|
||||
paddw mm1, mm2
|
||||
psrlw mm1, 5
|
||||
packuswb mm1, [reg_blank]
|
||||
movd edx, mm1
|
||||
shl dl, 2
|
||||
shr edx, 1
|
||||
shl dx, 3
|
||||
shr edx, 5
|
||||
mov %1, dx
|
||||
; Unpack eax to ecx and multiply by 6
|
||||
mov eax, [w5]
|
||||
mov ecx, eax
|
||||
shl ecx, 16
|
||||
or ecx, eax
|
||||
and ecx, [_hqx_green_redBlue_Mask]
|
||||
; multiply c1 by 6
|
||||
;imul ecx, 6 ; imul works, too, but might be slower on older systems?
|
||||
mov edx, ecx
|
||||
add ecx, ecx
|
||||
add ecx, edx
|
||||
add ecx, ecx
|
||||
|
||||
; unpack c2 to edx
|
||||
mov eax, %2
|
||||
mov edx, eax
|
||||
shl edx, 16
|
||||
or edx, eax
|
||||
and edx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; add c2 to c1*6
|
||||
add ecx, edx
|
||||
|
||||
; unpack c3 to edx
|
||||
mov eax, %3
|
||||
mov edx, eax
|
||||
shl edx, 16
|
||||
or edx, eax
|
||||
and edx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; add c3 and c2+c1*6, divide by 8, mask the result
|
||||
add edx, ecx
|
||||
shr edx, 3
|
||||
and edx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; finally, repack the mixed pixel
|
||||
mov ecx, edx
|
||||
shr ecx, 16
|
||||
or edx, ecx
|
||||
|
||||
mov %1, dx
|
||||
%endmacro
|
||||
|
||||
; interpolate16_3<bitFormat,2,3,3>
|
||||
; Mix three pixels with weight 2, 3, and 3, respectively: (c1*2+(c2+c3)*3)/8;
|
||||
%macro Interp9 3
|
||||
mov ecx, [_LUT16to32]
|
||||
movd mm1, [ecx+eax*4]
|
||||
mov edx, %2
|
||||
movd mm2, [ecx+edx*4]
|
||||
mov edx, %3
|
||||
movd mm3, [ecx+edx*4]
|
||||
punpcklbw mm1, [reg_blank]
|
||||
punpcklbw mm2, [reg_blank]
|
||||
punpcklbw mm3, [reg_blank]
|
||||
psllw mm1, 1
|
||||
paddw mm2, mm3
|
||||
pmullw mm2, [const3]
|
||||
paddw mm1, mm2
|
||||
psrlw mm1, 5
|
||||
packuswb mm1, [reg_blank]
|
||||
movd edx, mm1
|
||||
shl dl, 2
|
||||
shr edx, 1
|
||||
shl dx, 3
|
||||
shr edx, 5
|
||||
mov %1, dx
|
||||
; unpack c2
|
||||
mov eax, %2
|
||||
mov edx, eax
|
||||
shl edx, 16
|
||||
or edx, eax
|
||||
and edx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; unpack c3
|
||||
mov eax, %3
|
||||
mov ecx, eax
|
||||
shl ecx, 16
|
||||
or ecx, eax
|
||||
and ecx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; sum c2 and c3
|
||||
add edx, ecx
|
||||
|
||||
; multiply (c2+c3) by 3
|
||||
;imul edx, 3 ; imul works, too, but might be slower on older systems?
|
||||
mov ecx, edx
|
||||
add edx, edx
|
||||
add edx, ecx
|
||||
|
||||
; Restore eax, unpack it and multiply by 2
|
||||
mov eax, [w5]
|
||||
mov ecx, eax
|
||||
shl ecx, 16
|
||||
or ecx, eax
|
||||
and ecx, [_hqx_green_redBlue_Mask]
|
||||
add ecx, ecx ; multiply by 2
|
||||
|
||||
; sum 2*eax + 3*(c2+c3), divide by 8, mask the result
|
||||
add edx, ecx
|
||||
shr edx, 3
|
||||
and edx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; finally, repack the mixed pixel
|
||||
mov ecx, edx
|
||||
shr ecx, 16
|
||||
or edx, ecx
|
||||
|
||||
mov %1, dx
|
||||
%endmacro
|
||||
|
||||
; interpolate16_3<bitFormat,14,1,1>
|
||||
; Mix three pixels with weight 14, 1, and 1, respectively: (c1*14+c2+c3)/16;
|
||||
%macro Interp10 3
|
||||
mov ecx, [_LUT16to32]
|
||||
movd mm1, [ecx+eax*4]
|
||||
mov edx, %2
|
||||
movd mm2, [ecx+edx*4]
|
||||
mov edx, %3
|
||||
movd mm3, [ecx+edx*4]
|
||||
punpcklbw mm1, [reg_blank]
|
||||
punpcklbw mm2, [reg_blank]
|
||||
punpcklbw mm3, [reg_blank]
|
||||
pmullw mm1, [const14]
|
||||
paddw mm2, mm3
|
||||
paddw mm1, mm2
|
||||
psrlw mm1, 6
|
||||
packuswb mm1, [reg_blank]
|
||||
movd edx, mm1
|
||||
shl dl, 2
|
||||
shr edx, 1
|
||||
shl dx, 3
|
||||
shr edx, 5
|
||||
mov %1, dx
|
||||
; Unpack eax to ecx and multiply by 14
|
||||
mov eax, [w5]
|
||||
mov ecx, eax
|
||||
shl ecx, 16
|
||||
or ecx, eax
|
||||
and ecx, [_hqx_green_redBlue_Mask]
|
||||
; multiply c1 by 14
|
||||
;imul ecx, 14 ; imul works, too, but might be slower on older systems?
|
||||
mov edx, ecx
|
||||
shl ecx, 3
|
||||
sub ecx, edx
|
||||
add ecx, ecx
|
||||
|
||||
; unpack c2 to edx
|
||||
mov eax, %2
|
||||
mov edx, eax
|
||||
shl edx, 16
|
||||
or edx, eax
|
||||
and edx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; add c2 to c1*14
|
||||
add ecx, edx
|
||||
|
||||
; unpack c3 to edx
|
||||
mov eax, %3
|
||||
mov edx, eax
|
||||
shl edx, 16
|
||||
or edx, eax
|
||||
and edx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; add c3 and c2+c1*14, divide by 16, mask the result
|
||||
add edx, ecx
|
||||
shr edx, 4
|
||||
and edx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; finally, repack the mixed pixel
|
||||
mov ecx, edx
|
||||
shr ecx, 16
|
||||
or edx, ecx
|
||||
|
||||
mov %1, dx
|
||||
%endmacro
|
||||
|
||||
%macro PIXEL00_0 0
|
||||
|
@ -20,10 +20,14 @@
|
||||
|
||||
GLOBAL _hq3x_16
|
||||
|
||||
EXTERN _LUT16to32
|
||||
EXTERN _RGBtoYUV
|
||||
EXTERN _hqx_highbits
|
||||
EXTERN _hqx_lowbits
|
||||
EXTERN _hqx_low2bits
|
||||
EXTERN _hqx_low3bits
|
||||
EXTERN _hqx_greenMask
|
||||
EXTERN _hqx_redBlueMask
|
||||
EXTERN _hqx_green_redBlue_Mask
|
||||
|
||||
SECTION .bss
|
||||
linesleft resd 1
|
||||
@ -41,6 +45,8 @@ w7 resd 1
|
||||
w8 resd 1
|
||||
w9 resd 1
|
||||
|
||||
tmpData resd 1
|
||||
|
||||
SECTION .data
|
||||
|
||||
reg_blank dd 0,0
|
||||
@ -162,48 +168,87 @@ SECTION .text
|
||||
; interpolate16_2<bitFormat,7,1>
|
||||
; Mix two pixels with weight 7 and 1, respectively: (c1*7+c2)/8;
|
||||
%macro Interp3 2
|
||||
mov ecx, [_LUT16to32]
|
||||
movd mm1, [ecx+eax*4]
|
||||
mov edx, %2
|
||||
movd mm2, [ecx+edx*4]
|
||||
punpcklbw mm1, [reg_blank]
|
||||
punpcklbw mm2, [reg_blank]
|
||||
pmullw mm1, [const7]
|
||||
paddw mm1, mm2
|
||||
psrlw mm1, 5
|
||||
packuswb mm1, [reg_blank]
|
||||
movd edx, mm1
|
||||
shl dl, 2
|
||||
shr edx, 1
|
||||
shl dx, 3
|
||||
shr edx, 5
|
||||
mov %1, dx
|
||||
; ((p1&kLowBitsMask)<<2)
|
||||
mov ecx,eax
|
||||
and ecx,[_hqx_lowbits]
|
||||
shl ecx,2
|
||||
|
||||
; + ((p1&kLow2Bits)<<1)
|
||||
mov edx,eax
|
||||
and edx,[_hqx_low2bits]
|
||||
shl edx,1
|
||||
add ecx,edx
|
||||
|
||||
; + (p1&kLow3Bits)
|
||||
mov edx,eax
|
||||
and edx,[_hqx_low3bits]
|
||||
add ecx,edx
|
||||
|
||||
; + (p2&kLow3Bits)
|
||||
mov edx,%2
|
||||
and edx,[_hqx_low3bits]
|
||||
add ecx,edx
|
||||
|
||||
; & kLow3Bits -> ecx
|
||||
and ecx,[_hqx_low3bits]
|
||||
|
||||
; compute ((p1*7+p2) - ecx) >> 3;
|
||||
mov edx,eax
|
||||
shl edx,3
|
||||
sub edx,eax
|
||||
sub edx,ecx
|
||||
mov ecx,%2
|
||||
add edx,ecx
|
||||
shr edx,3
|
||||
|
||||
mov %1,dx
|
||||
%endmacro
|
||||
|
||||
; interpolate16_3<bitFormat,2,7,7>
|
||||
; Mix three pixels with weight 2, 7, and 7, respectively: (c1*2+(c2+c3)*7)/16;
|
||||
%macro Interp4 3
|
||||
mov ecx, [_LUT16to32]
|
||||
movd mm1, [ecx+eax*4]
|
||||
mov edx, %2
|
||||
movd mm2, [ecx+edx*4]
|
||||
mov edx, %3
|
||||
movd mm3, [ecx+edx*4]
|
||||
punpcklbw mm1, [reg_blank]
|
||||
punpcklbw mm2, [reg_blank]
|
||||
punpcklbw mm3, [reg_blank]
|
||||
psllw mm1, 1
|
||||
paddw mm2, mm3
|
||||
pmullw mm2, [const7]
|
||||
paddw mm1, mm2
|
||||
psrlw mm1, 6
|
||||
packuswb mm1, [reg_blank]
|
||||
movd edx, mm1
|
||||
shl dl, 2
|
||||
shr edx, 1
|
||||
shl dx, 3
|
||||
shr edx, 5
|
||||
mov %1, dx
|
||||
; unpack c2
|
||||
mov eax, %2
|
||||
mov edx, eax
|
||||
shl edx, 16
|
||||
or edx, eax
|
||||
and edx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; unpack c3
|
||||
mov eax, %3
|
||||
mov ecx, eax
|
||||
shl ecx, 16
|
||||
or ecx, eax
|
||||
and ecx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; sum c2 and c3
|
||||
add edx, ecx
|
||||
|
||||
; multiply (c2+c3) by 7
|
||||
;imul edx, 7 ; imul works, too, but might be slower on older systems?
|
||||
mov ecx, edx
|
||||
shl edx, 3
|
||||
sub edx, ecx
|
||||
|
||||
; Restore eax, unpack it and multiply by 2
|
||||
mov eax, [w5]
|
||||
mov ecx, eax
|
||||
shl ecx, 16
|
||||
or ecx, eax
|
||||
and ecx, [_hqx_green_redBlue_Mask]
|
||||
add ecx, ecx ; multiply by 2
|
||||
|
||||
; sum 2*eax + 7*(c2+c3), divide by 16, mask the result
|
||||
add edx, ecx
|
||||
shr edx, 4
|
||||
and edx, [_hqx_green_redBlue_Mask]
|
||||
|
||||
; finally, repack the mixed pixel
|
||||
mov ecx, edx
|
||||
shr ecx, 16
|
||||
or edx, ecx
|
||||
|
||||
mov %1, dx
|
||||
%endmacro
|
||||
|
||||
; interpolate16_2<bitFormat,1,1>
|
||||
@ -211,9 +256,14 @@ SECTION .text
|
||||
%macro Interp5 3
|
||||
mov edx,%2
|
||||
mov ecx,%3
|
||||
and edx,[_hqx_highbits]
|
||||
and ecx,[_hqx_highbits]
|
||||
add edx,ecx
|
||||
|
||||
xor edx,ecx ; xor pixels
|
||||
mov [tmpData],edx ; store tmp result
|
||||
xor edx,ecx ; restore original value of edx (avoids a reload)
|
||||
add edx,ecx ; sum pixels
|
||||
mov ecx,[tmpData]
|
||||
and ecx,[_hqx_lowbits]
|
||||
sub edx,ecx
|
||||
shr edx,1
|
||||
mov %1,dx
|
||||
%endmacro
|
||||
|
Loading…
Reference in New Issue
Block a user