mirror of
https://github.com/libretro/scummvm.git
synced 2024-12-14 21:59:17 +00:00
Updates to the scummvm blitting code as discussed on the mailing list.
1) Remove DS version of the ARM blitters in favour of the 'normal' ARM one. 2) Update normal ARM blitter to use Carlo's clever algorithm. 3) Update C version with Max Horn's patch (slightly tweaked - counting down on loops is better, M'kay). svn-id: r34006
This commit is contained in:
parent
ca3dcdfd4e
commit
90b59af2ba
@ -20,149 +20,12 @@
|
||||
@
|
||||
@ @author Robin Watts (robin@wss.co.uk)
|
||||
|
||||
.global asmDrawStripToScreen
|
||||
.global asmCopy8Col
|
||||
.global Rescale_320x256xPAL8_To_256x256x1555
|
||||
.global Rescale_320x256x1555_To_256x256x1555
|
||||
.section .itcm,"ax", %progbits
|
||||
.align 2
|
||||
.code 32
|
||||
|
||||
@ ARM implementation of asmDrawStripToScreen.
|
||||
@
|
||||
@ C prototype would be:
|
||||
@
|
||||
@ extern "C" void asmDrawStripToScreen(int height,
|
||||
@ int width,
|
||||
@ byte const *text,
|
||||
@ byte const *src,
|
||||
@ byte *dst,
|
||||
@ int vsPitch,
|
||||
@ int vsScreenWidth,
|
||||
@ int textSurfacePitch);
|
||||
@
|
||||
@ In addition, we assume that text, src and dst are all word (4 byte)
|
||||
@ aligned. This is the same assumption that the old 'inline' version
|
||||
@ made.
|
||||
asmDrawStripToScreen:
@ Compose a width x height strip of the text surface over the game
@ image: each output byte is the text byte unless that byte equals
@ the transparency value (253, replicated through r10 below --
@ presumably CHARSET_MASK_TRANSPARENCY; confirm against the C code),
@ in which case the src (game) byte shows through.
@ First four arguments arrive in registers:
|
||||
@ r0 = height
|
||||
@ r1 = width
|
||||
@ r2 = text
|
||||
@ r3 = src
|
||||
MOV r12,r13 @ r12 = caller's sp: the stacked args live here
|
||||
STMFD r13!,{r4-r7,r9-r11,R14}
|
||||
LDMIA r12,{r4,r5,r6,r7} @ fetch the four stacked arguments
|
||||
@ r4 = dst
|
||||
@ r5 = vsPitch
|
||||
@ r6 = vmScreenWidth (vsScreenWidth in the C prototype)
|
||||
@ r7 = textSurfacePitch
|
||||
|
||||
CMP r0,#0 @ If height<=0
|
||||
MOVLE r0,#1 @ height=1
|
||||
CMP r1,#4 @ If width<4
|
||||
BLT end @ return
|
||||
|
||||
@ Width &= ~4 ? What's that about then? Width &= ~3 I could have
|
||||
@ understood...
|
||||
BIC r1,r1,#4
|
||||
|
||||
@ Turn the pitches into end-of-row skips: by the time a row ends the
@ inner loop has already advanced each pointer by width bytes.
SUB r5,r5,r1 @ vsPitch -= width
|
||||
SUB r6,r6,r1 @ vmScreenWidth -= width
|
||||
SUB r7,r7,r1 @ textSurfacePitch -= width
|
||||
MOV r10,#253 @ transparency value in the low byte...
|
||||
ORR r10,r10,r10,LSL #8
|
||||
ORR r10,r10,r10,LSL #16 @ r10 = mask (253 in all four bytes)
|
||||
yLoop:
|
||||
MOV r14,r1 @ r14 = width
|
||||
xLoop:
|
||||
LDR r12,[r2],#4 @ r12 = [text]
|
||||
LDR r11,[r3],#4 @ r11 = [src]
|
||||
CMP r12,r10 @ all four text bytes transparent?
|
||||
BNE singleByteCompare @ no -> resolve byte by byte
|
||||
SUBS r14,r14,#4
|
||||
STR r11,[r4], #4 @ r4 = [dst] (src word copied through unchanged)
|
||||
BGT xLoop
|
||||
|
||||
@ Row finished: step all three pointers to the next row.
ADD r2,r2,r7 @ text += textSurfacePitch
|
||||
ADD r3,r3,r5 @ src += vsPitch
|
||||
ADD r4,r4,r6 @ dst += vmScreenWidth
|
||||
SUBS r0,r0,#1
|
||||
BGT yLoop
|
||||
LDMFD r13!,{r4-r7,r9-r11,PC} @ all rows done: restore and return
|
||||
|
||||
singleByteCompare:
@ Mixed word: resolve the four bytes one at a time.  Each step pulls
@ the top byte of r12, substitutes the top byte of r11 when the text
@ byte is transparent, then feeds it back via r12 = r9 | (r12 << 8);
@ after four steps r12 holds the merged output word.
@ NOTE(review): r11 is never shifted between steps, so steps 2-4
@ substitute the SAME (top) src byte instead of the corresponding
@ one -- this looks wrong for words with more than one transparent
@ byte; verify.  (The replacement algorithm in this commit avoids
@ the byte-at-a-time path entirely.)
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
STR r12,[r4],#4 @ store the merged word
|
||||
SUBS r14,r14,#4
|
||||
BGT xLoop
|
||||
|
||||
@ Row finished (byte-compare path): same row advance as above.
ADD r2,r2,r7 @ text += textSurfacePitch
|
||||
ADD r3,r3,r5 @ src += vsPitch
|
||||
ADD r4,r4,r6 @ dst += vmScreenWidth
|
||||
SUBS r0,r0,#1
|
||||
BGT yLoop
|
||||
end:
@ Reached directly when width < 4: nothing blitted.
|
||||
LDMFD r13!,{r4-r7,r9-r11,PC}
|
||||
|
||||
|
||||
@ ARM implementation of asmCopy8Col
|
||||
@
|
||||
@ C prototype would be:
|
||||
@
|
||||
@ extern "C" void asmCopy8Col(byte *dst,
|
||||
@ int dstPitch,
|
||||
@ const byte *src,
|
||||
@ int height);
|
||||
@
|
||||
@ In addition, we assume that src and dst are both word (4 byte)
|
||||
@ aligned. This is the same assumption that the old 'inline' version
|
||||
@ made.
|
||||
asmCopy8Col:
@ Copy an 8-byte-wide column, 'height' rows tall, from src to dst.
@ Each row is two 32-bit words; both pointers advance by dstPitch per
@ row (post-index #4 after word 0, then r1 = dstPitch-4 after word 1).
|
||||
@ r0 = dst
|
||||
@ r1 = dstPitch
|
||||
@ r2 = src
|
||||
@ r3 = height
|
||||
STMFD r13!,{r14}
|
||||
SUB r1,r1,#4 @ r1 = dstPitch-4: post-index step after a row's 1st word
|
||||
|
||||
@ The loop is unrolled to two rows per pass.  For an odd height,
@ round r3 up to even and enter at roll2, so the first pass copies a
@ single row and the total copied is exactly 'height' rows.
TST r3,#1 @ height odd?
|
||||
ADDNE r3,r3,#1
|
||||
BNE roll2
|
||||
yLoop2:
|
||||
LDR r12,[r2],#4 @ first row: word 0
|
||||
LDR r14,[r2],r1 @ first row: word 1, then advance to next row
|
||||
STR r12,[r0],#4
|
||||
STR r14,[r0],r1
|
||||
roll2:
|
||||
LDR r12,[r2],#4 @ second row: word 0
|
||||
LDR r14,[r2],r1 @ second row: word 1
|
||||
SUBS r3,r3,#2 @ two rows consumed this pass
|
||||
STR r12,[r0],#4
|
||||
STR r14,[r0],r1
|
||||
BNE yLoop2
|
||||
|
||||
LDMFD r13!,{PC} @ pop saved r14 straight into pc: return
|
||||
|
||||
|
||||
@ ARM implementation of Rescale_320x256x1555_To_256x256x1555
|
||||
@
|
||||
@ C prototype would be:
|
||||
|
@ -612,32 +612,37 @@ void ScummEngine::drawStripToScreen(VirtScreen *vs, int x, int width, int top, i
|
||||
assert(0 == (width & 3));
|
||||
|
||||
// Compose the text over the game graphics
|
||||
|
||||
// TODO: Optimize this code. There are several things that come immediately to mind:
|
||||
// (1) Loop unrolling: We could read 4 or even 8 pixels at once, since everything is
|
||||
// a multiple of 8 here.
|
||||
// (2) More ASM versions (in particular, the ARM code for the NDS could be used on
|
||||
// all ARM systems, couldn't it?)
|
||||
// (3) Better encoding of the text surface data. This is the one with the biggest
|
||||
// potential.
|
||||
// (a) Keep an "isEmpty" marker for each pixel row in the _textSurface. The idea
|
||||
// is that most rows won't contain any text data, so we can just use memcpy.
|
||||
// (b) RLE encode the _textSurface row-wise. This is an improved variant of (a),
|
||||
// but also more complicated to implement, and incurs a bigger overhead when
|
||||
// writing to the text surface.
|
||||
#ifdef USE_ARM_GFX_ASM
|
||||
asmDrawStripToScreen(height, width, text, src, dst, vs->pitch, width, _textSurface.pitch);
|
||||
#else
|
||||
for (int h = 0; h < height * m; ++h) {
|
||||
for (int w = 0; w < width * m; ++w) {
|
||||
byte tmp = *text++;
|
||||
if (tmp == CHARSET_MASK_TRANSPARENCY)
|
||||
tmp = *src;
|
||||
*dst++ = tmp;
|
||||
src++;
|
||||
// We blit four pixels at a time, for improved performance.
|
||||
const uint32 *src32 = (const uint32 *)src;
|
||||
const uint32 *text32 = (const uint32 *)text;
|
||||
uint32 *dst32 = (uint32 *)dst;
|
||||
|
||||
vsPitch >>= 2;
|
||||
const int textPitch = (_textSurface.pitch - width * m) >> 2;
|
||||
for (int h = height * m; h > 0; --h) {
|
||||
for (int w = width*m; w > 0; w-=4) {
|
||||
uint32 temp = *text32++;
|
||||
|
||||
// Generate a byte mask for those text pixels (bytes) with
|
||||
// value CHARSET_MASK_TRANSPARENCY. In the end, each byte
|
||||
// in mask will be either equal to 0x00 or 0xFF.
|
||||
// Doing it this way avoids branches and bytewise operations,
|
||||
// at the cost of readability ;).
|
||||
uint32 mask = temp ^ CHARSET_MASK_TRANSPARENCY_32;
|
||||
mask = (((mask & 0x7f7f7f7f) + 0x7f7f7f7f) | mask) & 0x80808080;
|
||||
mask = ((mask >> 7) + 0x7f7f7f7f) ^ 0x80808080;
|
||||
|
||||
// The following line is equivalent to this code:
|
||||
// *dst32++ = (*src32++ & mask) | (temp & ~mask);
|
||||
// However, some compilers can generate somewhat better
|
||||
// machine code for this equivalent statement:
|
||||
*dst32++ = ((temp ^ *src32++) & mask) ^ temp;
|
||||
}
|
||||
src += vsPitch;
|
||||
text += _textSurface.pitch - width * m;
|
||||
src32 += vsPitch;
|
||||
text32 += textPitch;
|
||||
}
|
||||
#endif
|
||||
src = _compositeBuf;
|
||||
|
@ -24,7 +24,7 @@
|
||||
|
||||
.global asmDrawStripToScreen
|
||||
.global asmCopy8Col
|
||||
|
||||
|
||||
@ ARM implementation of asmDrawStripToScreen.
|
||||
@
|
||||
@ C prototype would be:
|
||||
@ -47,7 +47,7 @@ asmDrawStripToScreen:
|
||||
@ r2 = text
|
||||
@ r3 = src
|
||||
MOV r12,r13
|
||||
STMFD r13!,{r4-r7,r9-r11,R14}
|
||||
STMFD r13!,{r4-r11,R14}
|
||||
LDMIA r12,{r4,r5,r6,r7}
|
||||
@ r4 = dst
|
||||
@ r5 = vsPitch
|
||||
@ -69,57 +69,46 @@ asmDrawStripToScreen:
|
||||
MOV r10,#253
|
||||
ORR r10,r10,r10,LSL #8
|
||||
ORR r10,r10,r10,LSL #16 @ r10 = mask
|
||||
yLoop:
|
||||
MOV r14,r1 @ r14 = width
|
||||
MOV r8,#0x7F
|
||||
ORR r8, r8, r8, LSL #8
|
||||
ORR r8, r8, r8, LSL #16 @ r8 = 7f7f7f7f
|
||||
STR r1,[r13,#-4]! @ Stack width
|
||||
B xLoop
|
||||
|
||||
notEntirelyTransparent:
|
||||
AND r14,r9, r8 @ r14 = mask & 7f7f7f7f
|
||||
ADD r14,r14,r8 @ r14 = (mask & 7f7f7f7f)+7f7f7f7f
|
||||
ORR r14,r14,r9 @ r14 |= mask
|
||||
BIC r14,r14,r8 @ r14 &= 80808080
|
||||
ADD r14,r8, r14,LSR #7 @ r14 = (rx>>7) + 7f7f7f7f
|
||||
EOR r14,r14,r8 @ r14 ^= 7f7f7f7f
|
||||
@ So bytes of r14 are 00 where source was matching value,FF otherwise
|
||||
BIC r11,r11,r14
|
||||
AND r12,r12,r14
|
||||
ORR r12,r11,r12
|
||||
STR r12,[r4],#4
|
||||
SUBS r1,r1,#4
|
||||
BLE endXLoop
|
||||
xLoop:
|
||||
LDR r12,[r2],#4 @ r12 = [text]
|
||||
LDR r11,[r3],#4 @ r11 = [src]
|
||||
CMP r12,r10
|
||||
BNE singleByteCompare
|
||||
SUBS r14,r14,#4
|
||||
LDR r12,[r2],#4 @ r12 = temp = [text]
|
||||
LDR r11,[r3],#4 @ r11 = [src]
|
||||
@ Stall
|
||||
EORS r9, r12,r10 @ r9 = mask = temp ^ TRANSPARENCY
|
||||
BNE notEntirelyTransparent
|
||||
SUBS r1, r1, #4
|
||||
STR r11,[r4], #4 @ r4 = [dst]
|
||||
BGT xLoop
|
||||
|
||||
endXLoop:
|
||||
ADD r2,r2,r7 @ text += textSurfacePitch
|
||||
ADD r3,r3,r5 @ src += vsPitch
|
||||
ADD r4,r4,r6 @ dst += vmScreenWidth
|
||||
SUBS r0,r0,#1
|
||||
BGT yLoop
|
||||
LDMFD r13!,{r4-r7,r9-r11,PC}
|
||||
|
||||
singleByteCompare:
|
||||
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
MOV r9,r12,LSR #24 @ r9 = 1st byte of [text]
|
||||
CMP r9,r10,LSR #24 @ if (r9 == mask)
|
||||
MOVEQ r9,r11,LSR #24 @ r9 = 1st byte of [src]
|
||||
ORR r12,r9,r12,LSL #8 @ r12 = combine r9 and r12
|
||||
|
||||
STR r12,[r4],#4
|
||||
SUBS r14,r14,#4
|
||||
LDRGT r1,[r13] @ r14 = width
|
||||
BGT xLoop
|
||||
|
||||
ADD r2,r2,r7 @ text += textSurfacePitch
|
||||
ADD r3,r3,r5 @ src += vsPitch
|
||||
ADD r4,r4,r6 @ dst += vmScreenWidth
|
||||
SUBS r0,r0,#1
|
||||
BGT yLoop
|
||||
ADD r13,r13,#4
|
||||
end:
|
||||
LDMFD r13!,{r4-r7,r9-r11,PC}
|
||||
|
||||
LDMFD r13!,{r4-r11,PC}
|
||||
|
||||
@ ARM implementation of asmCopy8Col
|
||||
@
|
||||
@ C prototype would be:
|
||||
@ -156,4 +145,4 @@ roll2:
|
||||
STR r14,[r0],r1
|
||||
BNE yLoop2
|
||||
|
||||
LDMFD r13!,{PC}
|
||||
LDMFD r13!,{PC}
|
||||
|
Loading…
Reference in New Issue
Block a user