mirror of
https://github.com/libretro/scummvm.git
synced 2024-12-14 21:59:17 +00:00
Updates to the scummvm blitting code as discussed on the mailing list.
1) Remove DS version of the ARM blitters in favour of the 'normal' ARM one. 2) Update normal ARM blitter to use Carlo's clever algorithm. 3) Update C version with Max Horn's patch (slightly tweaked - counting down on loops is better, M'kay). svn-id: r34006
This commit is contained in:
parent
ca3dcdfd4e
commit
90b59af2ba
@ -20,149 +20,12 @@
|
||||
@
|
||||
@ @author Robin Watts (robin@wss.co.uk)
|
||||
|
||||
.global asmDrawStripToScreen
|
||||
.global asmCopy8Col
|
||||
.global Rescale_320x256xPAL8_To_256x256x1555
|
||||
.global Rescale_320x256x1555_To_256x256x1555
|
||||
.section .itcm,"ax", %progbits
|
||||
.align 2
|
||||
.code 32
|
||||
|
||||
@ ARM implementation of asmDrawStripToScreen.
|
||||
@
|
||||
@ C prototype would be:
|
||||
@
|
||||
@ extern "C" void asmDrawStripToScreen(int height,
|
||||
@ int width,
|
||||
@ byte const *text,
|
||||
@ byte const *src,
|
||||
@ byte *dst,
|
||||
@ int vsPitch,
|
||||
@ int vsScreenWidth,
|
||||
@ int textSurfacePitch);
|
||||
@
|
||||
@ In addition, we assume that text, src and dst are all word (4 byte)
|
||||
@ aligned. This is the same assumption that the old 'inline' version
|
||||
@ made.
|
||||
asmDrawStripToScreen:
@ Compose a width x height strip of the text surface over the game
@ image: each output byte is the text byte unless that byte equals
@ the transparency value (253, replicated through r10 below --
@ presumably CHARSET_MASK_TRANSPARENCY; confirm against the C code),
@ in which case the src (game) byte shows through.
@ First four arguments arrive in registers:
|
||||
@ r0 = height
|
||||
@ r1 = width
|
||||
@ r2 = text
|
||||
@ r3 = src
|
||||
MOV r12,r13 @ r12 = caller's sp: the stacked args live here
|
||||
STMFD r13!,{r4-r7,r9-r11,R14}
|
||||
LDMIA r12,{r4,r5,r6,r7} @ fetch the four stacked arguments
|
||||
@ r4 = dst
|
||||
@ r5 = vsPitch
|
||||
@ r6 = vmScreenWidth (vsScreenWidth in the C prototype)
|
||||
@ r7 = textSurfacePitch
|
||||
|
||||
CMP r0,#0 @ If height<=0
|
||||
MOVLE r0,#1 @ height=1
|
||||
CMP r1,#4 @ If width<4
|
||||
BLT end @ return
|
||||
|
||||
@ Width &= ~4 ? What's that about then? Width &= ~3 I could have
|
||||
@ understood...
|
||||
BIC r1,r1,#4
|
||||
|
||||
@ Turn the pitches into end-of-row skips: by the time a row ends the
@ inner loop has already advanced each pointer by width bytes.
SUB r5,r5,r1 @ vsPitch -= width
|
||||
SUB r6,r6,r1 @ vmScreenWidth -= width
|
||||
SUB r7,r7,r1 @ textSurfacePitch -= width
|
||||
MOV r10,#253 @ transparency value in the low byte...
|
||||
ORR r10,r10,r10,LSL #8
|
||||
ORR r10,r10,r10,LSL #16 @ r10 = mask (253 in all four bytes)
|
||||
yLoop:
|
||||
MOV r14,r1 @ r14 = width
|
||||
xLoop:
|
||||
LDR r12,[r2],#4 @ r12 = [text]
|
||||
LDR r11,[r3],#4 @ r11 = [src]
|
||||
CMP r12,r10 @ all four text bytes transparent?
|
||||
BNE singleByteCompare @ no -> resolve byte by byte
|
||||
SUBS r14,r14,#4
|
||||
STR r11,[r4], #4 @ r4 = [dst] (src word copied through unchanged)
|
||||
BGT xLoop
|
||||
|
||||
@ Row finished: step all three pointers to the next row.
ADD r2,r2,r7 @ text += textSurfacePitch
|
||||
ADD r3,r3,r5 @ src += vsPitch
|
||||
ADD r4,r4,r6 @ dst += vmScreenWidth
|
||||
SUBS r0,r0,#1
|
||||
BGT yLoop
|
||||
LDMFD r13!,{r4-r7,r9-r11,PC} @ all rows done: restore and return
|
||||
|
||||
singleByteCompare:
@ Mixed word: resolve the four bytes one at a time.  Each step pulls
@ the top byte of r12, substitutes the top byte of r11 when the text
@ byte is transparent, then feeds it back via r12 = r9 | (r12 << 8);
@ after four steps r12 holds the merged output word.
@ NOTE(review): r11 is never shifted between steps, so steps 2-4
@ substitute the SAME (top) src byte instead of the corresponding
@ one -- this looks wrong for words with more than one transparent
@ byte; verify.  (The replacement algorithm in this commit avoids
@ the byte-at-a-time path entirely.)
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
STR r12,[r4],#4 @ store the merged word
|
||||
SUBS r14,r14,#4
|
||||
BGT xLoop
|
||||
|
||||
@ Row finished (byte-compare path): same row advance as above.
ADD r2,r2,r7 @ text += textSurfacePitch
|
||||
ADD r3,r3,r5 @ src += vsPitch
|
||||
ADD r4,r4,r6 @ dst += vmScreenWidth
|
||||
SUBS r0,r0,#1
|
||||
BGT yLoop
|
||||
end:
@ Reached directly when width < 4: nothing blitted.
|
||||
LDMFD r13!,{r4-r7,r9-r11,PC}
|
||||
|
||||
|
||||
@ ARM implementation of asmCopy8Col
|
||||
@
|
||||
@ C prototype would be:
|
||||
@
|
||||
@ extern "C" void asmCopy8Col(byte *dst,
|
||||
@ int dstPitch,
|
||||
@ const byte *src,
|
||||
@ int height);
|
||||
@
|
||||
@ In addition, we assume that src and dst are both word (4 byte)
|
||||
@ aligned. This is the same assumption that the old 'inline' version
|
||||
@ made.
|
||||
asmCopy8Col:
@ Copy an 8-byte-wide column, 'height' rows tall, from src to dst.
@ Each row is two 32-bit words; both pointers advance by dstPitch per
@ row (post-index #4 after word 0, then r1 = dstPitch-4 after word 1).
|
||||
@ r0 = dst
|
||||
@ r1 = dstPitch
|
||||
@ r2 = src
|
||||
@ r3 = height
|
||||
STMFD r13!,{r14}
|
||||
SUB r1,r1,#4 @ r1 = dstPitch-4: post-index step after a row's 1st word
|
||||
|
||||
@ The loop is unrolled to two rows per pass.  For an odd height,
@ round r3 up to even and enter at roll2, so the first pass copies a
@ single row and the total copied is exactly 'height' rows.
TST r3,#1 @ height odd?
|
||||
ADDNE r3,r3,#1
|
||||
BNE roll2
|
||||
yLoop2:
|
||||
LDR r12,[r2],#4 @ first row: word 0
|
||||
LDR r14,[r2],r1 @ first row: word 1, then advance to next row
|
||||
STR r12,[r0],#4
|
||||
STR r14,[r0],r1
|
||||
roll2:
|
||||
LDR r12,[r2],#4 @ second row: word 0
|
||||
LDR r14,[r2],r1 @ second row: word 1
|
||||
SUBS r3,r3,#2 @ two rows consumed this pass
|
||||
STR r12,[r0],#4
|
||||
STR r14,[r0],r1
|
||||
BNE yLoop2
|
||||
|
||||
LDMFD r13!,{PC} @ pop saved r14 straight into pc: return
|
||||
|
||||
|
||||
@ ARM implementation of Rescale_320x256x1555_To_256x256x1555
|
||||
@
|
||||
@ C prototype would be:
|
||||
|
@ -612,32 +612,37 @@ void ScummEngine::drawStripToScreen(VirtScreen *vs, int x, int width, int top, i
|
||||
assert(0 == (width & 3));
|
||||
|
||||
// Compose the text over the game graphics
|
||||
|
||||
// TODO: Optimize this code. There are several things that come immediately to mind:
|
||||
// (1) Loop unrolling: We could read 4 or even 8 pixels at once, since everything is
|
||||
// a multiple of 8 here.
|
||||
// (2) More ASM versions (in particular, the ARM code for the NDS could be used on
|
||||
// all ARM systems, couldn't it?)
|
||||
// (3) Better encoding of the text surface data. This is the one with the biggest
|
||||
// potential.
|
||||
// (a) Keep an "isEmpty" marker for each pixel row in the _textSurface. The idea
|
||||
// is that most rows won't contain any text data, so we can just use memcpy.
|
||||
// (b) RLE encode the _textSurface row-wise. This is an improved variant of (a),
|
||||
// but also more complicated to implement, and incurs a bigger overhead when
|
||||
// writing to the text surface.
|
||||
#ifdef USE_ARM_GFX_ASM
|
||||
asmDrawStripToScreen(height, width, text, src, dst, vs->pitch, width, _textSurface.pitch);
|
||||
#else
|
||||
for (int h = 0; h < height * m; ++h) {
|
||||
for (int w = 0; w < width * m; ++w) {
|
||||
byte tmp = *text++;
|
||||
if (tmp == CHARSET_MASK_TRANSPARENCY)
|
||||
tmp = *src;
|
||||
*dst++ = tmp;
|
||||
src++;
|
||||
// We blit four pixels at a time, for improved performance.
|
||||
const uint32 *src32 = (const uint32 *)src;
|
||||
const uint32 *text32 = (const uint32 *)text;
|
||||
uint32 *dst32 = (uint32 *)dst;
|
||||
|
||||
vsPitch >>= 2;
|
||||
const int textPitch = (_textSurface.pitch - width * m) >> 2;
|
||||
for (int h = height * m; h > 0; --h) {
|
||||
for (int w = width*m; w > 0; w-=4) {
|
||||
uint32 temp = *text32++;
|
||||
|
||||
// Generate a byte mask for those text pixels (bytes) with
|
||||
// value CHARSET_MASK_TRANSPARENCY. In the end, each byte
|
||||
// in mask will be either equal to 0x00 or 0xFF.
|
||||
// Doing it this way avoids branches and bytewise operations,
|
||||
// at the cost of readability ;).
|
||||
uint32 mask = temp ^ CHARSET_MASK_TRANSPARENCY_32;
|
||||
mask = (((mask & 0x7f7f7f7f) + 0x7f7f7f7f) | mask) & 0x80808080;
|
||||
mask = ((mask >> 7) + 0x7f7f7f7f) ^ 0x80808080;
|
||||
|
||||
// The following line is equivalent to this code:
|
||||
// *dst32++ = (*src32++ & mask) | (temp & ~mask);
|
||||
// However, some compilers can generate somewhat better
|
||||
// machine code for this equivalent statement:
|
||||
*dst32++ = ((temp ^ *src32++) & mask) ^ temp;
|
||||
}
|
||||
src += vsPitch;
|
||||
text += _textSurface.pitch - width * m;
|
||||
src32 += vsPitch;
|
||||
text32 += textPitch;
|
||||
}
|
||||
#endif
|
||||
src = _compositeBuf;
|
||||
|
@ -24,7 +24,7 @@
|
||||
|
||||
.global asmDrawStripToScreen
|
||||
.global asmCopy8Col
|
||||
|
||||
|
||||
@ ARM implementation of asmDrawStripToScreen.
|
||||
@
|
||||
@ C prototype would be:
|
||||
@ -47,7 +47,7 @@ asmDrawStripToScreen:
|
||||
@ r2 = text
|
||||
@ r3 = src
|
||||
MOV r12,r13
|
||||
STMFD r13!,{r4-r7,r9-r11,R14}
|
||||
STMFD r13!,{r4-r11,R14}
|
||||
LDMIA r12,{r4,r5,r6,r7}
|
||||
@ r4 = dst
|
||||
@ r5 = vsPitch
|
||||
@ -69,57 +69,46 @@ asmDrawStripToScreen:
|
||||
MOV r10,#253
|
||||
ORR r10,r10,r10,LSL #8
|
||||
ORR r10,r10,r10,LSL #16 @ r10 = mask
|
||||
yLoop:
|
||||
MOV r14,r1 @ r14 = width
|
||||
MOV r8,#0x7F
|
||||
ORR r8, r8, r8, LSL #8
|
||||
ORR r8, r8, r8, LSL #16 @ r8 = 7f7f7f7f
|
||||
STR r1,[r13,#-4]! @ Stack width
|
||||
B xLoop
|
||||
|
||||
notEntirelyTransparent:
|
||||
AND r14,r9, r8 @ r14 = mask & 7f7f7f7f
|
||||
ADD r14,r14,r8 @ r14 = (mask & 7f7f7f7f)+7f7f7f7f
|
||||
ORR r14,r14,r9 @ r14 |= mask
|
||||
BIC r14,r14,r8 @ r14 &= 80808080
|
||||
ADD r14,r8, r14,LSR #7 @ r14 = (rx>>7) + 7f7f7f7f
|
||||
EOR r14,r14,r8 @ r14 ^= 7f7f7f7f
|
||||
@ So bytes of r14 are 00 where source was matching value,FF otherwise
|
||||
BIC r11,r11,r14
|
||||
AND r12,r12,r14
|
||||
ORR r12,r11,r12
|
||||
STR r12,[r4],#4
|
||||
SUBS r1,r1,#4
|
||||
BLE endXLoop
|
||||
xLoop:
|
||||
LDR r12,[r2],#4 @ r12 = [text]
|
||||
LDR r11,[r3],#4 @ r11 = [src]
|
||||
CMP r12,r10
|
||||
BNE singleByteCompare
|
||||
SUBS r14,r14,#4
|
||||
LDR r12,[r2],#4 @ r12 = temp = [text]
|
||||
LDR r11,[r3],#4 @ r11 = [src]
|
||||
@ Stall
|
||||
EORS r9, r12,r10 @ r9 = mask = temp ^ TRANSPARENCY
|
||||
BNE notEntirelyTransparent
|
||||
SUBS r1, r1, #4
|
||||
STR r11,[r4], #4 @ r4 = [dst]
|
||||
BGT xLoop
|
||||
|
||||
endXLoop:
|
||||
ADD r2,r2,r7 @ text += textSurfacePitch
|
||||
ADD r3,r3,r5 @ src += vsPitch
|
||||
ADD r4,r4,r6 @ dst += vmScreenWidth
|
||||
SUBS r0,r0,#1
|
||||
BGT yLoop
|
||||
LDMFD r13!,{r4-r7,r9-r11,PC}
|
||||
|
||||
singleByteCompare:
|
||||
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
STR r12,[r4],#4
|
||||
SUBS r14,r14,#4
|
||||
LDRGT r1,[r13] @ r14 = width
|
||||
BGT xLoop
|
||||
|
||||
ADD r2,r2,r7 @ text += textSurfacePitch
|
||||
ADD r3,r3,r5 @ src += vsPitch
|
||||
ADD r4,r4,r6 @ dst += vmScreenWidth
|
||||
SUBS r0,r0,#1
|
||||
BGT yLoop
|
||||
ADD r13,r13,#4
|
||||
end:
|
||||
LDMFD r13!,{r4-r7,r9-r11,PC}
|
||||
|
||||
LDMFD r13!,{r4-r11,PC}
|
||||
|
||||
@ ARM implementation of asmCopy8Col
|
||||
@
|
||||
@ C prototype would be:
|
||||
@ -156,4 +145,4 @@ roll2:
|
||||
STR r14,[r0],r1
|
||||
BNE yLoop2
|
||||
|
||||
LDMFD r13!,{PC}
|
||||
LDMFD r13!,{PC}
|
||||
|
Loading…
Reference in New Issue
Block a user