GRAPHICS: ATARI: Align surface on a 16-byte boundary

Also implement a CPU-based (move16) copy optimization for the 68040 / 68060.
This commit is contained in:
Miro Kropacek 2023-03-05 14:30:36 +01:00 committed by Miro Kropáček
parent d3be1b0c34
commit 06af761337
2 changed files with 167 additions and 17 deletions

View File

@ -1001,7 +1001,7 @@ void AtariGraphicsManager::Cursor::setSurface(const void *buf, int w, int h, int
if (surface.w != w || surface.h != h || surface.format != format)
surface.create(w, h, format);
surface.copyRectToSurface(buf, surface.pitch, 0, 0, w, h);
surface.copyRectToSurface(buf, w * format.bytesPerPixel, 0, 0, w, h);
hotspotX = _hotspotX;
hotspotY = _hotspotY;

View File

@ -22,15 +22,24 @@
#include "graphics/blit.h"
#include "graphics/surface.h"
#include <cstdlib> // calloc
#include <cstring> // memcpy
#include <cstdlib> // malloc
#include <cstring> // memcpy, memset
#include <mint/cookie.h>
#include <mint/falcon.h>
#include "backends/graphics/atari/atari-graphics-superblitter.h"
#include "common/textconsole.h" // error
/**
 * Tells whether the move16 copy path may be used.
 *
 * The C__CPU cookie is looked up on the first call only; the outcome is kept
 * in a function-local static and returned by every later call. The answer is
 * true when the cookie is found and its value is 40 or higher.
 */
static inline bool hasMove16() {
	static bool move16Available = []() -> bool {
		long cpuCookie;

		// no cookie at all => the move16 path is never taken
		if (Getcookie(C__CPU, &cpuCookie) != C_FOUND)
			return false;

		return cpuCookie >= 40;
	}();

	return move16Available;
}
namespace Graphics {
constexpr size_t ALIGN = 16; // 16 bytes
// hijack surface overrides here as well as these are tightly related
// to the blitting routine below
void Surface::create(int16 width, int16 height, const PixelFormat &f) {
@ -40,24 +49,41 @@ void Surface::create(int16 width, int16 height, const PixelFormat &f) {
w = width;
h = height;
format = f;
pitch = w * format.bytesPerPixel;
// align pitch to a 16-byte boundary for a possible C2P conversion
pitch = (w * format.bytesPerPixel + ALIGN - 1) & (-ALIGN);
if (width && height) {
if (VgetMonitor() == MON_VGA && Getcookie(C_SupV, NULL) == C_FOUND)
pixels = (void*)ct60_vmalloc(width * height * format.bytesPerPixel);
else
pixels = calloc(width * height, format.bytesPerPixel);
assert(pixels);
if (VgetMonitor() == MON_VGA && Getcookie(C_SupV, NULL) == C_FOUND) {
pixels = (void *)ct60_vmalloc(height * pitch);
if (!pixels)
error("Not enough SVRAM to allocate a surface");
assert((uintptr)pixels >= 0xA0000000);
} else {
// align buffer to a 16-byte boundary for move16 or C2P conversion
void *pixelsUnaligned = ::malloc(sizeof(uintptr) + (height * pitch) + ALIGN - 1);
if (!pixelsUnaligned)
error("Not enough memory to allocate a surface");
pixels = (void *)(((uintptr)pixelsUnaligned + sizeof(uintptr) + ALIGN - 1) & (-ALIGN));
// store the unaligned pointer for later free()
*((uintptr *)pixels - 1) = (uintptr)pixelsUnaligned;
}
memset(pixels, 0, height * pitch);
}
}
// Releases the pixel buffer and resets the surface fields (pixels, w, h, pitch,
// format) to their empty values.
// NOTE(review): this listing looks like a diff rendered without its +/- markers,
// so both the previous and the updated statements appear below -- confirm
// against the repository which line of each pair is the current one.
void Surface::free() {
// pointers whose top byte is 0xA0 or above are handed back via ct60_vmfree()
// (Surface::create() asserts that ct60_vmalloc() results are >= 0xA0000000)
if (((uintptr)pixels & 0xFF000000) >= 0xA0000000)
ct60_vmfree(pixels);
else
::free(pixels);
// otherwise, for a non-null 'pixels', the uintptr slot right below it holds the
// pointer originally returned by malloc() (stored there by Surface::create());
// that stored pointer, not 'pixels' itself, is what gets passed to ::free()
else if (pixels)
::free((void *)*((uintptr *)pixels - 1));
pixels = 0;
pixels = nullptr;
w = h = pitch = 0;
format = PixelFormat();
}
@ -87,12 +113,136 @@ void copyBlit(byte *dst, const byte *src,
// wait until we finish otherwise we may overwrite pixels written manually afterwards
while (*SV_BLITTER_CONTROL & 1);
} else if (dstPitch == srcPitch && ((w * bytesPerPixel) == dstPitch)) {
memcpy(dst, src, dstPitch * h);
if (hasMove16() && ((uintptr)src & (ALIGN - 1)) == 0 && ((uintptr)dst & (ALIGN - 1)) == 0) {
__asm__ volatile(
" move.l %2,d0\n"
" lsr.l #4,d0\n"
" beq.b 3f\n"
" moveq #0x0f,d1\n"
" and.l d0,d1\n"
" neg.l d1\n"
" lsr.l #4,d0\n"
" jmp (2f,pc,d1.l*4)\n"
"1:\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
"2:\n"
" dbra d0,1b\n"
// handle also the unlikely case when 'dstPitch'
// is not divisible by 16 but 'src' and 'dst' are
"3:\n"
" moveq #0x0f,d0\n"
" and.l %2,d0\n"
" neg.l d0\n"
" jmp (4f,pc,d0.l*2)\n"
// only 15x move.b as 16 would be handled above
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
"4:\n"
: // outputs
: "a"(src), "a"(dst), "g"(dstPitch * h) // inputs
: "d0", "d1", "cc" AND_MEMORY
);
} else {
memcpy(dst, src, dstPitch * h);
}
} else {
for (uint i = 0; i < h; ++i) {
memcpy(dst, src, w * bytesPerPixel);
dst += dstPitch;
src += srcPitch;
if (hasMove16() && ((uintptr)src & (ALIGN - 1)) == 0 && ((uintptr)dst & (ALIGN - 1)) == 0) {
__asm__ volatile(
"0:\n"
" move.l %2,d0\n"
" lsr.l #4,d0\n"
" beq.b 3f\n"
" moveq #0x0f,d1\n"
" and.l d0,d1\n"
" neg.l d1\n"
" lsr.l #4,d0\n"
" jmp (2f,pc,d1.l*4)\n"
"1:\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
" move16 (%0)+,(%1)+\n"
"2:\n"
" dbra d0,1b\n"
// handle (w * bytesPerPixel) % 16
"3:\n"
" moveq #0x0f,d0\n"
" and.l %2,d0\n"
" neg.l d0\n"
" jmp (4f,pc,d0.l*2)\n"
// only 15x move.b as 16 would be handled above
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
" move.b (%0)+,(%1)+\n"
"4:\n"
" add.l %4,%1\n"
" add.l %5,%0\n"
" dbra %3,0b\n"
: // outputs
: "a"(src), "a"(dst), "g"(w * bytesPerPixel), "d"(h - 1),
"g"(dstPitch - w * bytesPerPixel), "g"(srcPitch - w * bytesPerPixel) // inputs
: "d0", "d1", "d2", "cc" AND_MEMORY
);
} else {
for (uint i = 0; i < h; ++i) {
memcpy(dst, src, w * bytesPerPixel);
dst += dstPitch;
src += srcPitch;
}
}
}
}