Updates to the ScummVM blitting code, as discussed on the mailing list.

1) Remove the DS version of the ARM blitters in favour of the 'normal' ARM one.
2) Update the normal ARM blitter to use Carlo's clever algorithm.
3) Update the C version with Max Horn's patch (slightly tweaked - counting down
   on loops is better, M'kay).

svn-id: r34006
Robin Watts 2008-08-18 20:04:15 +00:00
parent ca3dcdfd4e
commit 90b59af2ba
3 changed files with 61 additions and 204 deletions

View File

@@ -20,149 +20,12 @@
 @
 @ @author Robin Watts (robin@wss.co.uk)
-.global asmDrawStripToScreen
-.global asmCopy8Col
 .global Rescale_320x256xPAL8_To_256x256x1555
 .global Rescale_320x256x1555_To_256x256x1555
 .section .itcm,"ax", %progbits
 .align 2
 .code 32
-@ ARM implementation of asmDrawStripToScreen.
-@
-@ C prototype would be:
-@
-@ extern "C" void asmDrawStripToScreen(int height,
-@                                      int width,
-@                                      byte const *text,
-@                                      byte const *src,
-@                                      byte *dst,
-@                                      int vsPitch,
-@                                      int vsScreenWidth,
-@                                      int textSurfacePitch);
-@
-@ In addition, we assume that text, src and dst are all word (4 byte)
-@ aligned. This is the same assumption that the old 'inline' version
-@ made.
-asmDrawStripToScreen:
-        @ r0 = height
-        @ r1 = width
-        @ r2 = text
-        @ r3 = src
-        MOV     r12,r13
-        STMFD   r13!,{r4-r7,r9-r11,R14}
-        LDMIA   r12,{r4,r5,r6,r7}
-        @ r4 = dst
-        @ r5 = vsPitch
-        @ r6 = vmScreenWidth
-        @ r7 = textSurfacePitch
-        CMP     r0,#0                   @ If height<=0
-        MOVLE   r0,#1                   @    height=1
-        CMP     r1,#4                   @ If width<4
-        BLT     end                     @    return
-        @ Width &= ~4 ? What's that about then? Width &= ~3 I could have
-        @ understood...
-        BIC     r1,r1,#4
-        SUB     r5,r5,r1                @ vsPitch -= width
-        SUB     r6,r6,r1                @ vmScreenWidth -= width
-        SUB     r7,r7,r1                @ textSurfacePitch -= width
-        MOV     r10,#253
-        ORR     r10,r10,r10,LSL #8
-        ORR     r10,r10,r10,LSL #16     @ r10 = mask
-yLoop:
-        MOV     r14,r1                  @ r14 = width
-xLoop:
-        LDR     r12,[r2],#4             @ r12 = [text]
-        LDR     r11,[r3],#4             @ r11 = [src]
-        CMP     r12,r10
-        BNE     singleByteCompare
-        SUBS    r14,r14,#4
-        STR     r11,[r4],#4             @ r4 = [dst]
-        BGT     xLoop
-        ADD     r2,r2,r7                @ text += textSurfacePitch
-        ADD     r3,r3,r5                @ src += vsPitch
-        ADD     r4,r4,r6                @ dst += vmScreenWidth
-        SUBS    r0,r0,#1
-        BGT     yLoop
-        LDMFD   r13!,{r4-r7,r9-r11,PC}
-singleByteCompare:
-        MOV     r9,r12,LSR #24          @ r9 = 1st byte of [text]
-        CMP     r9,r10,LSR #24          @ if (r9 == mask)
-        MOVEQ   r9,r11,LSR #24          @     r9 = 1st byte of [src]
-        ORR     r12,r9,r12,LSL #8       @ r12 = combine r9 and r12
-        MOV     r9,r12,LSR #24          @ r9 = 1st byte of [text]
-        CMP     r9,r10,LSR #24          @ if (r9 == mask)
-        MOVEQ   r9,r11,LSR #24          @     r9 = 1st byte of [src]
-        ORR     r12,r9,r12,LSL #8       @ r12 = combine r9 and r12
-        MOV     r9,r12,LSR #24          @ r9 = 1st byte of [text]
-        CMP     r9,r10,LSR #24          @ if (r9 == mask)
-        MOVEQ   r9,r11,LSR #24          @     r9 = 1st byte of [src]
-        ORR     r12,r9,r12,LSL #8       @ r12 = combine r9 and r12
-        MOV     r9,r12,LSR #24          @ r9 = 1st byte of [text]
-        CMP     r9,r10,LSR #24          @ if (r9 == mask)
-        MOVEQ   r9,r11,LSR #24          @     r9 = 1st byte of [src]
-        ORR     r12,r9,r12,LSL #8       @ r12 = combine r9 and r12
-        STR     r12,[r4],#4
-        SUBS    r14,r14,#4
-        BGT     xLoop
-        ADD     r2,r2,r7                @ text += textSurfacePitch
-        ADD     r3,r3,r5                @ src += vsPitch
-        ADD     r4,r4,r6                @ dst += vmScreenWidth
-        SUBS    r0,r0,#1
-        BGT     yLoop
-end:
-        LDMFD   r13!,{r4-r7,r9-r11,PC}
-@ ARM implementation of asmCopy8Col
-@
-@ C prototype would be:
-@
-@ extern "C" void asmCopy8Col(byte *dst,
-@                             int dstPitch,
-@                             const byte *src,
-@                             int height);
-@
-@ In addition, we assume that src and dst are both word (4 byte)
-@ aligned. This is the same assumption that the old 'inline' version
-@ made.
-asmCopy8Col:
-        @ r0 = dst
-        @ r1 = dstPitch
-        @ r2 = src
-        @ r3 = height
-        STMFD   r13!,{r14}
-        SUB     r1,r1,#4
-        TST     r3,#1
-        ADDNE   r3,r3,#1
-        BNE     roll2
-yLoop2:
-        LDR     r12,[r2],#4
-        LDR     r14,[r2],r1
-        STR     r12,[r0],#4
-        STR     r14,[r0],r1
-roll2:
-        LDR     r12,[r2],#4
-        LDR     r14,[r2],r1
-        SUBS    r3,r3,#2
-        STR     r12,[r0],#4
-        STR     r14,[r0],r1
-        BNE     yLoop2
-        LDMFD   r13!,{PC}
 @ ARM implementation of Rescale_320x256x1555_To_256x256x1555
 @
 @ C prototype would be:
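
(For reference: a minimal C sketch of the contract that every version of
asmDrawStripToScreen implements, processing one byte at a time where the
assembly processes four. It assumes the transparency key 253/0xFD that the
code above builds into r10; the typedef and function name are illustrative,
not ScummVM API. The pitch arguments are taken after the "-= width"
adjustment done at the top of the assembly.)

typedef unsigned char byte;

void drawStripToScreenRef(int height, int width,
                          const byte *text, const byte *src, byte *dst,
                          int vsPitch, int vmScreenWidth, int textSurfacePitch) {
	for (int h = 0; h < height; ++h) {
		for (int w = 0; w < width; ++w) {
			byte t = *text++;
			// A text byte equal to the key is transparent: keep the game pixel.
			*dst++ = (t == 253) ? *src : t;
			src++;
		}
		// Advance to the next row; the pitches already have width subtracted.
		text += textSurfacePitch;
		src += vsPitch;
		dst += vmScreenWidth;
	}
}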

View File

@@ -612,32 +612,37 @@ void ScummEngine::drawStripToScreen(VirtScreen *vs, int x, int width, int top, i
 	assert(0 == (width & 3));
 	// Compose the text over the game graphics
-	// TODO: Optimize this code. There are several things that come immediately to mind:
-	// (1) Loop unrolling: We could read 4 or even 8 pixels at once, since everything is
-	//     a multiple of 8 here.
-	// (2) More ASM versions (in particular, the ARM code for the NDS could be used on
-	//     all ARM systems, couldn't it?)
-	// (3) Better encoding of the text surface data. This is the one with the biggest
-	//     potential.
-	//     (a) Keep an "isEmpty" marker for each pixel row in the _textSurface. The idea
-	//         is that most rows won't contain any text data, so we can just use memcpy.
-	//     (b) RLE encode the _textSurface row-wise. This is an improved variant of (a),
-	//         but also more complicated to implement, and incurs a bigger overhead when
-	//         writing to the text surface.
 #ifdef USE_ARM_GFX_ASM
 	asmDrawStripToScreen(height, width, text, src, dst, vs->pitch, width, _textSurface.pitch);
 #else
-	for (int h = 0; h < height * m; ++h) {
-		for (int w = 0; w < width * m; ++w) {
-			byte tmp = *text++;
-			if (tmp == CHARSET_MASK_TRANSPARENCY)
-				tmp = *src;
-			*dst++ = tmp;
-			src++;
+	// We blit four pixels at a time, for improved performance.
+	const uint32 *src32 = (const uint32 *)src;
+	const uint32 *text32 = (const uint32 *)text;
+	uint32 *dst32 = (uint32 *)dst;
+	vsPitch >>= 2;
+	const int textPitch = (_textSurface.pitch - width * m) >> 2;
+	for (int h = height * m; h > 0; --h) {
+		for (int w = width * m; w > 0; w -= 4) {
+			uint32 temp = *text32++;
+			// Generate a byte mask for those text pixels (bytes) with
+			// value CHARSET_MASK_TRANSPARENCY. In the end, each byte
+			// in mask will be either equal to 0x00 or 0xFF.
+			// Doing it this way avoids branches and bytewise operations,
+			// at the cost of readability ;).
+			uint32 mask = temp ^ CHARSET_MASK_TRANSPARENCY_32;
+			mask = (((mask & 0x7f7f7f7f) + 0x7f7f7f7f) | mask) & 0x80808080;
+			mask = ((mask >> 7) + 0x7f7f7f7f) ^ 0x80808080;
+			// The following line is equivalent to this code:
+			//   *dst32++ = (*src32++ & mask) | (temp & ~mask);
+			// However, some compilers can generate somewhat better
+			// machine code for this equivalent statement:
+			*dst32++ = ((temp ^ *src32++) & mask) ^ temp;
 		}
-		src += vsPitch;
-		text += _textSurface.pitch - width * m;
+		src32 += vsPitch;
+		text32 += textPitch;
 	}
 #endif
 	src = _compositeBuf;
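
(The three mask lines above deserve unpacking. Below is a hedged, self-contained
sketch of the trick; the helper names and the 0xFDFDFDFD key are illustrative,
since CHARSET_MASK_TRANSPARENCY_32 is just the transparency byte replicated into
all four byte lanes.)

#include <cstdint>
#include <cstdio>

// Per byte lane: 0xFF where the text byte equals the transparency key,
// 0x00 everywhere else. No branches and no bytewise memory accesses.
static uint32_t transparencyMask32(uint32_t temp) {
	uint32_t mask = temp ^ 0xFDFDFDFDu;  // 0x00 in lanes matching the key
	// SWAR zero-byte test: bit 7 of each lane ends up set iff the lane was nonzero.
	mask = (((mask & 0x7f7f7f7f) + 0x7f7f7f7f) | mask) & 0x80808080;
	// Spread bit 7 across each lane and invert: zero lanes become 0xFF.
	return ((mask >> 7) + 0x7f7f7f7f) ^ 0x80808080;
}

// Branchless select, as in the loop above: src where mask is 0xFF, text elsewhere.
static uint32_t blend4(uint32_t text4, uint32_t src4) {
	uint32_t mask = transparencyMask32(text4);
	return ((text4 ^ src4) & mask) ^ text4;
}

int main() {
	// Two of the four text bytes are the 0xFD key; those lanes take src bytes.
	printf("%08X\n", (unsigned)blend4(0x12FD34FDu, 0xAABBCCDDu));  // prints 12BB34DD
	return 0;
}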

View File

@@ -24,7 +24,7 @@
 .global asmDrawStripToScreen
 .global asmCopy8Col
 @ ARM implementation of asmDrawStripToScreen.
 @
 @ C prototype would be:
@@ -47,7 +47,7 @@ asmDrawStripToScreen:
         @ r2 = text
         @ r3 = src
         MOV     r12,r13
-        STMFD   r13!,{r4-r7,r9-r11,R14}
+        STMFD   r13!,{r4-r11,R14}
         LDMIA   r12,{r4,r5,r6,r7}
         @ r4 = dst
         @ r5 = vsPitch
@@ -69,57 +69,46 @@ asmDrawStripToScreen:
         MOV     r10,#253
         ORR     r10,r10,r10,LSL #8
         ORR     r10,r10,r10,LSL #16     @ r10 = mask
-yLoop:
-        MOV     r14,r1                  @ r14 = width
+        MOV     r8,#0x7F
+        ORR     r8,r8,r8,LSL #8
+        ORR     r8,r8,r8,LSL #16        @ r8 = 7f7f7f7f
+        STR     r1,[r13,#-4]!           @ Stack width
+        B       xLoop
+notEntirelyTransparent:
+        AND     r14,r9,r8               @ r14 = mask & 7f7f7f7f
+        ADD     r14,r14,r8              @ r14 = (mask & 7f7f7f7f)+7f7f7f7f
+        ORR     r14,r14,r9              @ r14 |= mask
+        BIC     r14,r14,r8              @ r14 &= 80808080
+        ADD     r14,r8,r14,LSR #7       @ r14 = (rx>>7) + 7f7f7f7f
+        EOR     r14,r14,r8              @ r14 ^= 7f7f7f7f
+        @ So bytes of r14 are 00 where source was matching value, FF otherwise
+        BIC     r11,r11,r14
+        AND     r12,r12,r14
+        ORR     r12,r11,r12
+        STR     r12,[r4],#4
+        SUBS    r1,r1,#4
+        BLE     endXLoop
 xLoop:
-        LDR     r12,[r2],#4             @ r12 = [text]
-        LDR     r11,[r3],#4             @ r11 = [src]
-        CMP     r12,r10
-        BNE     singleByteCompare
-        SUBS    r14,r14,#4
+        LDR     r12,[r2],#4             @ r12 = temp = [text]
+        LDR     r11,[r3],#4             @ r11 = [src]
+        @ Stall
+        EORS    r9,r12,r10              @ r9 = mask = temp ^ TRANSPARENCY
+        BNE     notEntirelyTransparent
+        SUBS    r1,r1,#4
         STR     r11,[r4],#4             @ r4 = [dst]
         BGT     xLoop
+endXLoop:
         ADD     r2,r2,r7                @ text += textSurfacePitch
         ADD     r3,r3,r5                @ src += vsPitch
         ADD     r4,r4,r6                @ dst += vmScreenWidth
         SUBS    r0,r0,#1
-        BGT     yLoop
-        LDMFD   r13!,{r4-r7,r9-r11,PC}
-singleByteCompare:
-        MOV     r9,r12,LSR #24          @ r9 = 1st byte of [text]
-        CMP     r9,r10,LSR #24          @ if (r9 == mask)
-        MOVEQ   r9,r11,LSR #24          @     r9 = 1st byte of [src]
-        ORR     r12,r9,r12,LSL #8       @ r12 = combine r9 and r12
-        MOV     r9,r12,LSR #24          @ r9 = 1st byte of [text]
-        CMP     r9,r10,LSR #24          @ if (r9 == mask)
-        MOVEQ   r9,r11,LSR #24          @     r9 = 1st byte of [src]
-        ORR     r12,r9,r12,LSL #8       @ r12 = combine r9 and r12
-        MOV     r9,r12,LSR #24          @ r9 = 1st byte of [text]
-        CMP     r9,r10,LSR #24          @ if (r9 == mask)
-        MOVEQ   r9,r11,LSR #24          @     r9 = 1st byte of [src]
-        ORR     r12,r9,r12,LSL #8       @ r12 = combine r9 and r12
-        MOV     r9,r12,LSR #24          @ r9 = 1st byte of [text]
-        CMP     r9,r10,LSR #24          @ if (r9 == mask)
-        MOVEQ   r9,r11,LSR #24          @     r9 = 1st byte of [src]
-        ORR     r12,r9,r12,LSL #8       @ r12 = combine r9 and r12
-        STR     r12,[r4],#4
-        SUBS    r14,r14,#4
+        LDRGT   r1,[r13]                @ r1 = width
         BGT     xLoop
-        ADD     r2,r2,r7                @ text += textSurfacePitch
-        ADD     r3,r3,r5                @ src += vsPitch
-        ADD     r4,r4,r6                @ dst += vmScreenWidth
-        SUBS    r0,r0,#1
-        BGT     yLoop
+        ADD     r13,r13,#4
 end:
-        LDMFD   r13!,{r4-r7,r9-r11,PC}
+        LDMFD   r13!,{r4-r11,PC}
 @ ARM implementation of asmCopy8Col
 @
 @ C prototype would be:
@@ -156,4 +145,4 @@ roll2:
         STR     r14,[r0],r1
         BNE     yLoop2
-        LDMFD   r13!,{PC}
+        LDMFD   r13!,{PC}
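
(For completeness, a hedged C sketch of what asmCopy8Col does; the names are
again illustrative. Note that the assembly steps src by dstPitch as well, so
both surfaces are assumed to share a pitch. The loop also shows the
down-counting style that point 3 of the commit message refers to: the decrement
itself provides the termination test, which on ARM compiles to a flag-setting
SUBS with no separate compare, exactly as in the SUBS/BGT and SUBS/BNE loops
above.)

#include <cstring>

typedef unsigned char byte;

void copy8ColRef(byte *dst, int dstPitch, const byte *src, int height) {
	do {
		// One 8-byte-wide row; the assembly performs this as two word copies.
		std::memcpy(dst, src, 8);
		dst += dstPitch;
		src += dstPitch;  // same stride as dst, matching LDR ...,[r2],r1 above
	} while (--height > 0);  // count down: decrement and test in one step
}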