GRAPHICS: ATARI: Align surface on a 16-byte boundary

Also implement a CPU-based optimization for the 68040 / 68060.
2025-03-05 17:57:14 +00:00 · 2023-03-05 14:30:36 +01:00 · 2023-03-05 14:30:36 +01:00 · 06af761337
commit 06af761337
parent d3be1b0c34
2 changed files with 167 additions and 17 deletions
--- a/backends/graphics/atari/atari-graphics.cpp
+++ b/backends/graphics/atari/atari-graphics.cpp
@ -1001,7 +1001,7 @@ void AtariGraphicsManager::Cursor::setSurface(const void *buf, int w, int h, int
 	if (surface.w != w || surface.h != h || surface.format != format)
 		surface.create(w, h, format);

-	surface.copyRectToSurface(buf, surface.pitch, 0, 0, w, h);
+	surface.copyRectToSurface(buf, w * format.bytesPerPixel, 0, 0, w, h);

 	hotspotX = _hotspotX;
 	hotspotY = _hotspotY;
--- a/graphics/blit-atari.cpp
+++ b/graphics/blit-atari.cpp
@ -22,15 +22,24 @@
 #include "graphics/blit.h"
 #include "graphics/surface.h"

-#include <cstdlib>	// calloc
-#include <cstring>	// memcpy
+#include <cstdlib>	// malloc
+#include <cstring>	// memcpy, memset
 #include <mint/cookie.h>
 #include <mint/falcon.h>

 #include "backends/graphics/atari/atari-graphics-superblitter.h"
+#include "common/textconsole.h"	// error
+
+static inline bool hasMove16() {	// true when the C__CPU cookie is found and its value is >= 40
+	long val;	// passed by address to Getcookie(); only read when the lookup returns C_FOUND (short-circuit &&)
+	static bool hasMove16 = Getcookie(C__CPU, &val) == C_FOUND && val >= 40;	// function-local static: the cookie lookup runs on the first call only
+	return hasMove16;
+}

 namespace Graphics {

+constexpr size_t ALIGN = 16;	// 16 bytes
+
 // hijack surface overrides here as well as these are tightly related
 // to the blitting routine below
 void Surface::create(int16 width, int16 height, const PixelFormat &f) {
@ -40,24 +49,41 @@ void Surface::create(int16 width, int16 height, const PixelFormat &f) {
 	w = width;
 	h = height;
 	format = f;
-	pitch = w * format.bytesPerPixel;
+	// align pitch to a 16-byte boundary for a possible C2P conversion
+	pitch = (w * format.bytesPerPixel + ALIGN - 1) & (-ALIGN);

 	if (width && height) {
-		if (VgetMonitor() == MON_VGA && Getcookie(C_SupV, NULL) == C_FOUND)
-			pixels = (void*)ct60_vmalloc(width * height * format.bytesPerPixel);
-		else
-			pixels = calloc(width * height, format.bytesPerPixel);
-		assert(pixels);
+		if (VgetMonitor() == MON_VGA && Getcookie(C_SupV, NULL) == C_FOUND) {
+			pixels = (void *)ct60_vmalloc(height * pitch);
+
+			if (!pixels)
+				error("Not enough SVRAM to allocate a surface");
+
+			assert((uintptr)pixels >= 0xA0000000);
+		} else {
+			// align buffer to a 16-byte boundary for move16 or C2P conversion
+			void *pixelsUnaligned = ::malloc(sizeof(uintptr) + (height * pitch) + ALIGN - 1);
+
+			if (!pixelsUnaligned)
+				error("Not enough memory to allocate a surface");
+
+			pixels = (void *)(((uintptr)pixelsUnaligned + sizeof(uintptr) + ALIGN - 1) & (-ALIGN));
+
+			// store the unaligned pointer for later free()
+			*((uintptr *)pixels - 1) = (uintptr)pixelsUnaligned;
+		}
+
+		memset(pixels, 0, height * pitch);
 	}
 }

 void Surface::free() {
 	if (((uintptr)pixels & 0xFF000000) >= 0xA0000000)
 		ct60_vmfree(pixels);
-	else
-		::free(pixels);
+	else if (pixels)	// null check: the word in front of the buffer must not be read through a null pointer
+		::free((void *)*((uintptr *)pixels - 1));	// the uintptr stored just before `pixels` is the unaligned malloc() pointer saved by Surface::create()

-	pixels = 0;
+	pixels = nullptr;
 	w = h = pitch = 0;
 	format = PixelFormat();
 }
@ -87,12 +113,136 @@ void copyBlit(byte *dst, const byte *src,
 		// wait until we finish otherwise we may overwrite pixels written manually afterwards
 		while (*SV_BLITTER_CONTROL & 1);
 	} else if (dstPitch == srcPitch && ((w * bytesPerPixel) == dstPitch)) {
-		memcpy(dst, src, dstPitch * h);
+		if (hasMove16() && ((uintptr)src & (ALIGN - 1)) == 0 && ((uintptr)dst & (ALIGN - 1)) == 0) {
+			__asm__ volatile(
+			"	move.l	%2,d0\n"
+			"	lsr.l	#4,d0\n"
+			"	beq.b	3f\n"
+
+			"	moveq	#0x0f,d1\n"
+			"	and.l	d0,d1\n"
+			"	neg.l	d1\n"
+			"	lsr.l	#4,d0\n"
+			"	jmp	(2f,pc,d1.l*4)\n"
+			"1:\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"2:\n"
+			"	dbra	d0,1b\n"
+			// handle also the unlikely case when 'dstPitch'
+			// is not divisible by 16 but 'src' and 'dst' are
+			"3:\n"
+			"	moveq	#0x0f,d0\n"
+			"	and.l	%2,d0\n"
+			"	neg.l	d0\n"
+			"	jmp	(4f,pc,d0.l*2)\n"
+			// only 15x move.b as 16 would be handled above
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"4:\n"
+				: // outputs
+				: "a"(src), "a"(dst), "g"(dstPitch * h) // inputs
+				: "d0", "d1", "cc" AND_MEMORY
+			);
+		} else {
+			memcpy(dst, src, dstPitch * h);
+		}
 	} else {
-		for (uint i = 0; i < h; ++i) {
-			memcpy(dst, src, w * bytesPerPixel);
-			dst += dstPitch;
-			src += srcPitch;
+		if (hasMove16() && ((uintptr)src & (ALIGN - 1)) == 0 && ((uintptr)dst & (ALIGN - 1)) == 0) {
+			__asm__ volatile(
+			"0:\n"
+			"	move.l	%2,d0\n"
+			"	lsr.l	#4,d0\n"
+			"	beq.b	3f\n"
+
+			"	moveq	#0x0f,d1\n"
+			"	and.l	d0,d1\n"
+			"	neg.l	d1\n"
+			"	lsr.l	#4,d0\n"
+			"	jmp	(2f,pc,d1.l*4)\n"
+			"1:\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"2:\n"
+			"	dbra	d0,1b\n"
+			// handle (w * bytesPerPixel) % 16
+			"3:\n"
+			"	moveq	#0x0f,d0\n"
+			"	and.l	%2,d0\n"
+			"	neg.l	d0\n"
+			"	jmp	(4f,pc,d0.l*2)\n"
+			// only 15x move.b as 16 would be handled above
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"4:\n"
+			"	add.l	%4,%1\n"
+			"	add.l	%5,%0\n"
+			"	dbra	%3,0b\n"
+				: // outputs
+				: "a"(src), "a"(dst), "g"(w * bytesPerPixel), "d"(h - 1),
+				  "g"(dstPitch - w * bytesPerPixel), "g"(srcPitch - w * bytesPerPixel) // inputs
+				: "d0", "d1", "d2", "cc" AND_MEMORY
+			);
+		} else {
+			for (uint i = 0; i < h; ++i) {
+				memcpy(dst, src, w * bytesPerPixel);
+				dst += dstPitch;
+				src += srcPitch;
+			}
 		}
 	}
 }