nv2a: Lazily synchronize surface data to/from RAM

2024-11-30 06:50:57 +00:00 · 2020-10-19 00:47:45 -07:00 · 2020-10-19 00:47:45 -07:00 · d6e52a02c4
commit d6e52a02c4
parent 33728b060f
12 changed files with 1490 additions and 359 deletions
--- a/hw/xbox/nv2a/debug.h
+++ b/hw/xbox/nv2a/debug.h
@ -21,6 +21,12 @@
 #ifndef HW_NV2A_DEBUG_H
 #define HW_NV2A_DEBUG_H

+#define NV2A_XPRINTF(x, ...) do { \
+    if (x) { \
+        fprintf(stderr, "nv2a: " __VA_ARGS__); \
+    } \
+} while (0) /* writes to stderr only when x is non-zero; the first variadic argument must be a string literal because it is concatenated onto the "nv2a: " prefix */
+
 // #define DEBUG_NV2A
 #ifdef DEBUG_NV2A
 # define NV2A_DPRINTF(format, ...)       printf("nv2a: " format, ## __VA_ARGS__)
--- a/hw/xbox/nv2a/gl/gloffscreen_sdl.c
+++ b/hw/xbox/nv2a/gl/gloffscreen_sdl.c
@ -36,16 +36,9 @@ struct _GloContext {
    SDL_GLContext gl_context;
 };

-static GloContext *g_context;
-
 /* Create an OpenGL context */
 GloContext *glo_context_create(void)
 {
-    if (g_context) {
-        glo_set_current(g_context);
-        return g_context;
-    }
-
    GloContext *context = (GloContext *)malloc(sizeof(GloContext));
    assert(context != NULL);

@ -87,8 +80,6 @@ GloContext *glo_context_create(void)

    glo_set_current(context);

-    g_context = context;
-
    return context;
 }

--- a/hw/xbox/nv2a/nv2a.c
+++ b/hw/xbox/nv2a/nv2a.c
@ -346,6 +346,7 @@ static void nv2a_init_memory(NV2AState *d, MemoryRegion *ram)
    d->ramin_ptr = memory_region_get_ram_ptr(&d->ramin);

    memory_region_set_log(d->vram, true, DIRTY_MEMORY_NV2A);
+    memory_region_set_log(d->vram, true, DIRTY_MEMORY_NV2A_TEX);
    memory_region_set_dirty(d->vram, 0, memory_region_size(d->vram));

    /* hacky. swap out vga's vram */
@ -401,7 +402,7 @@ static void nv2a_reset(NV2AState *d)
    d->pgraph.waiting_for_flip = false;
    d->pgraph.waiting_for_fifo_access = false;
    d->pgraph.waiting_for_context_switch = false;
-    d->pgraph.flush_pending = false;
+    d->pgraph.flush_pending = true;

    d->pmc.pending_interrupts = 0;
    d->pfifo.pending_interrupts = 0;
@ -479,6 +480,7 @@ static void nv2a_vm_state_change(void *opaque, int running, RunState state)
 {
    NV2AState *d = opaque;
    if (state == RUN_STATE_SAVE_VM) {
+        // FIXME: writeback all surfaces to RAM before snapshot
        nv2a_lock_fifo(d);
    } else if (state == RUN_STATE_RESTORE_VM) {
        nv2a_reset(d); // Early reset to avoid changing any state during load
--- a/hw/xbox/nv2a/nv2a.h
+++ b/hw/xbox/nv2a/nv2a.h
@ -2,6 +2,7 @@
 * QEMU Geforce NV2A implementation
 *
 * Copyright (c) 2012 espes
+ * Copyright (c) 2020 Matt Borgerson
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
@ -21,5 +22,7 @@
 #define HW_NV2A_H

 void nv2a_init(PCIBus *bus, int devfn, MemoryRegion *ram);
+void nv2a_gl_context_init(void); /* invoked by the UI's very-early display init in place of the former glo_context_create() call */
+int nv2a_get_framebuffer_surface(void); /* the UI binds the result as a GL_TEXTURE_2D name; 0 makes it fall back to the VGA surface texture */

 #endif
--- a/hw/xbox/nv2a/nv2a_int.h
+++ b/hw/xbox/nv2a/nv2a_int.h
@ -26,6 +26,7 @@

 #include "qemu/osdep.h"
 #include "qemu/thread.h"
+#include "qemu/queue.h"
 #include "qemu/main-loop.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
@ -129,6 +130,37 @@ typedef struct SurfaceShape {
    unsigned int anti_aliasing;
 } SurfaceShape;

+typedef struct SurfaceBinding { /* per-surface record; PGRAPHState keeps these on its `surfaces` queue and points at them via color_binding/zeta_binding */
+    QTAILQ_ENTRY(SurfaceBinding) entry; /* tail-queue linkage (see QTAILQ_HEAD(, SurfaceBinding) surfaces in PGRAPHState) */
+    MemAccessCallback *access_cb; /* NOTE(review): registration/ownership not visible in this diff view */
+
+    hwaddr vram_addr; /* presumably the surface's location in VRAM -- TODO confirm against pgraph.c */
+
+    SurfaceShape shape; /* copy of a SurfaceShape (the struct declared just above) */
+    uintptr_t dma_addr; /* presumably describes the DMA object the surface was bound through -- TODO confirm */
+    uintptr_t dma_len;
+    bool color; /* presumably true for color, false for zeta (depth/stencil) -- TODO confirm */
+    bool swizzle;
+
+    unsigned int width;
+    unsigned int height;
+    unsigned int pitch;
+    unsigned int bytes_per_pixel;
+    size_t size; /* presumably the byte length of the surface in guest memory -- TODO confirm */
+
+    GLenum gl_attachment; /* OpenGL-side parameters for this surface */
+    GLenum gl_internal_format;
+    GLenum gl_format;
+    GLenum gl_type;
+    GLuint gl_buffer; /* GL object name; NOTE(review): object kind (texture vs. renderbuffer) not visible here */
+
+    int frame_time; /* PGRAPHState carries counters of the same names; presumably snapshots of those -- TODO confirm */
+    int draw_time;
+    bool draw_dirty; /* presumably: GPU-side contents are newer than guest RAM -- TODO confirm */
+    bool download_pending; /* presumably: a GPU -> RAM copy has been requested -- TODO confirm */
+    bool upload_pending; /* presumably: a RAM -> GPU copy is needed before next use -- TODO confirm */
+} SurfaceBinding;
+
 typedef struct TextureShape {
    bool cubemap;
    unsigned int dimensionality;
@ -144,14 +176,20 @@ typedef struct TextureBinding {
    GLenum gl_target;
    GLuint gl_texture;
    unsigned int refcnt;
+    int draw_time;
+    uint64_t data_hash;
 } TextureBinding;

 typedef struct TextureKey {
    struct lru_node node;
    TextureShape state;
-    uint8_t *texture_data;
-    uint8_t *palette_data;
    TextureBinding *binding;
+
+    hwaddr texture_vram_offset;
+    hwaddr texture_length;
+    hwaddr palette_vram_offset;
+    hwaddr palette_length;
+    bool possibly_dirty;
 } TextureKey;

 typedef struct KelvinState {
@ -182,6 +220,19 @@ typedef struct PGRAPHState {
    uint32_t pending_interrupts;
    uint32_t enabled_interrupts;

+    int frame_time;
+    int draw_time;
+
+    struct s2t_rndr {
+        GLuint fbo, vao, vbo, prog;
+        GLuint tex_loc, surface_size_loc;
+    } s2t_rndr;
+
+    struct disp_rndr {
+        GLuint fbo, vao, vbo, prog;
+        GLuint tex_loc;
+    } disp_rndr;
+
    /* subchannels state we're not sure the location of... */
    ContextSurfaces2DState context_surfaces_2d;
    ImageBlitState image_blit;
@ -192,6 +243,17 @@ typedef struct PGRAPHState {
    unsigned int surface_type;
    SurfaceShape surface_shape;
    SurfaceShape last_surface_shape;
+    QTAILQ_HEAD(, SurfaceBinding) surfaces;
+    SurfaceBinding *color_binding, *zeta_binding;
+    struct {
+        int clip_x;
+        int clip_width;
+        int clip_y;
+        int clip_height;
+        int width;
+        int height;
+    } surface_binding_dim; // FIXME: Refactor
+    bool downloads_pending;

    hwaddr dma_a, dma_b;
    struct lru texture_cache;
@ -207,9 +269,8 @@ typedef struct PGRAPHState {
    /* FIXME: Move to NV_PGRAPH_BUMPMAT... */
    float bump_env_matrix[NV2A_MAX_TEXTURES - 1][4]; /* 3 allowed stages with 2x2 matrix each */

-    GloContext *gl_context;
    GLuint gl_framebuffer;
-    GLuint gl_color_buffer, gl_zeta_buffer;
+    GLuint gl_display_buffer;

    hwaddr dma_state;
    hwaddr dma_notifies;
@ -275,6 +336,7 @@ typedef struct PGRAPHState {
    bool waiting_for_fifo_access;
    bool waiting_for_context_switch;
    bool flush_pending;
+    bool gl_sync_pending;
 } PGRAPHState;

 typedef struct NV2AState {
@ -362,6 +424,9 @@ typedef struct NV2ABlockInfo {
    MemoryRegionOps ops;
 } NV2ABlockInfo;

+extern GloContext *g_nv2a_context_render;
+extern GloContext *g_nv2a_context_display;
+
 void nv2a_update_irq(NV2AState *d);

 #ifdef NV2A_DEBUG
@ -406,6 +471,8 @@ void pgraph_destroy(PGRAPHState *pg);
 void pgraph_context_switch(NV2AState *d, unsigned int channel_id);
 void pgraph_method(NV2AState *d, unsigned int subchannel,
                   unsigned int method, uint32_t parameter);
+void pgraph_gl_sync(NV2AState *d);
+void pgraph_process_pending_downloads(NV2AState *d);

 void *pfifo_thread(void *arg);
 void pfifo_kick(NV2AState *d);
--- a/hw/xbox/nv2a/pfifo.c
+++ b/hw/xbox/nv2a/pfifo.c
@ -504,12 +504,24 @@ static void pfifo_run_pusher(NV2AState *d)
 void *pfifo_thread(void *arg)
 {
    NV2AState *d = (NV2AState *)arg;
-    glo_set_current(d->pgraph.gl_context);
+    glo_set_current(g_nv2a_context_render);
+
+    rcu_register_thread();

    qemu_mutex_lock(&d->pfifo.lock);
    while (true) {
        d->pfifo.fifo_kick = false;

+        if (atomic_read(&d->pgraph.downloads_pending)) {
+            pgraph_process_pending_downloads(d);
+            atomic_set(&d->pgraph.downloads_pending, false);
+        }
+
+        if (atomic_read(&d->pgraph.gl_sync_pending)) {
+            pgraph_gl_sync(d);
+            atomic_set(&d->pgraph.gl_sync_pending, false);
+        }
+
        pfifo_run_pusher(d);
        pfifo_run_puller(d);

@ -526,6 +538,8 @@ void *pfifo_thread(void *arg)
    }
    qemu_mutex_unlock(&d->pfifo.lock);

+    rcu_unregister_thread();
+
    return NULL;
 }

--- a/hw/xbox/nv2a/pgraph.c
+++ b/hw/xbox/nv2a/pgraph.c
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@ -237,11 +237,12 @@ static inline bool cpu_physical_memory_get_dirty_flag(ram_addr_t addr,
 static inline bool cpu_physical_memory_is_clean(ram_addr_t addr)
 {
    bool nv2a = cpu_physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_NV2A);
+    bool nv2a_tex = cpu_physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_NV2A_TEX);
    bool vga = cpu_physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_VGA);
    bool code = cpu_physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_CODE);
    bool migration =
        cpu_physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_MIGRATION);
-    return !(nv2a && vga && code && migration);
+    return !(nv2a && nv2a_tex && vga && code && migration);
 }

 static inline uint8_t cpu_physical_memory_range_includes_clean(ram_addr_t start,
@ -254,6 +255,10 @@ static inline uint8_t cpu_physical_memory_range_includes_clean(ram_addr_t start,
        !cpu_physical_memory_all_dirty(start, length, DIRTY_MEMORY_NV2A)) {
        ret |= (1 << DIRTY_MEMORY_NV2A);
    }
+    if (mask & (1 << DIRTY_MEMORY_NV2A_TEX) &&
+        !cpu_physical_memory_all_dirty(start, length, DIRTY_MEMORY_NV2A_TEX)) {
+        ret |= (1 << DIRTY_MEMORY_NV2A_TEX);
+    }
    if (mask & (1 << DIRTY_MEMORY_VGA) &&
        !cpu_physical_memory_all_dirty(start, length, DIRTY_MEMORY_VGA)) {
        ret |= (1 << DIRTY_MEMORY_VGA);
@ -331,6 +336,10 @@ static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start,
                bitmap_set_atomic(blocks[DIRTY_MEMORY_NV2A]->blocks[idx],
                                  offset, next - page);
            }
+            if (unlikely(mask & (1 << DIRTY_MEMORY_NV2A_TEX))) {
+                bitmap_set_atomic(blocks[DIRTY_MEMORY_NV2A_TEX]->blocks[idx],
+                                  offset, next - page);
+            }

            page = next;
            idx++;
@ -379,6 +388,7 @@ static inline void cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap,

                    atomic_or(&blocks[DIRTY_MEMORY_VGA][idx][offset], temp);
                    atomic_or(&blocks[DIRTY_MEMORY_NV2A][idx][offset], temp);
+                    atomic_or(&blocks[DIRTY_MEMORY_NV2A_TEX][idx][offset], temp);

                    if (global_dirty_log) {
                        atomic_or(&blocks[DIRTY_MEMORY_MIGRATION][idx][offset],
@ -445,6 +455,7 @@ static inline void cpu_physical_memory_clear_dirty_range(ram_addr_t start,
    cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_MIGRATION);
    cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_VGA);
    cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_NV2A);
+    cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_NV2A_TEX);
    cpu_physical_memory_test_and_clear_dirty(start, length, DIRTY_MEMORY_CODE);
 }

--- a/include/exec/ramlist.h
+++ b/include/exec/ramlist.h
@ -12,7 +12,8 @@ typedef struct RAMBlockNotifier RAMBlockNotifier;
 #define DIRTY_MEMORY_CODE      1
 #define DIRTY_MEMORY_MIGRATION 2
 #define DIRTY_MEMORY_NV2A      3
-#define DIRTY_MEMORY_NUM       4        /* num of dirty bits */
+#define DIRTY_MEMORY_NV2A_TEX  4
+#define DIRTY_MEMORY_NUM       5        /* num of dirty bits */

 /* The dirty memory bitmap is split into fixed-size blocks to allow growth
 * under RCU.  The bitmap for a block can be accessed as follows:
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@ -1981,20 +1981,26 @@ int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr)
 void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client)
 {
    uint8_t mask = 1 << client;
-    uint8_t old_logging;

+#ifdef XBOX
+    assert((client == DIRTY_MEMORY_VGA) \
+        || (client == DIRTY_MEMORY_NV2A) \
+        || (client == DIRTY_MEMORY_NV2A_TEX));
    if (mr->alias) {
        memory_region_set_log(mr->alias, log, client);
        return;
    }
+    mr->vga_logging_count += log ? 1 : -1;
+#else
+    uint8_t old_logging;

-    assert((client == DIRTY_MEMORY_VGA) \
-        || (client == DIRTY_MEMORY_NV2A));
+    assert(client == DIRTY_MEMORY_VGA);
    old_logging = mr->vga_logging_count;
    mr->vga_logging_count += log ? 1 : -1;
    if (!!old_logging == !!mr->vga_logging_count) {
        return;
    }
+#endif

    memory_region_transaction_begin();
    mr->dirty_log_mask = (mr->dirty_log_mask & ~mask) | (log * mask);
--- a/ui/xemu-shaders.c
+++ b/ui/xemu-shaders.c
@ -301,8 +301,14 @@ struct fbo *create_fbo(int width, int height)
    return fbo;
 }

+static GLboolean m_blend; /* GL_BLEND state captured by render_to_fbo(); render_to_default_fb() disables blending again when this is false */
+
 void render_to_default_fb(void)
 {
+    if (!m_blend) {
+        glDisable(GL_BLEND);
+    }
+
    // Restore default framebuffer, viewport, blending funciton
    glBindFramebuffer(GL_FRAMEBUFFER, main_fb);
    glViewport(vp[0], vp[1], vp[2], vp[3]);
@ -311,10 +317,13 @@ void render_to_default_fb(void)

 GLuint render_to_fbo(struct fbo *fbo)
 {
-    glEnable(GL_BLEND);
+    m_blend = glIsEnabled(GL_BLEND); /* remember whether blending was already on before this pass */
+    if (!m_blend) {
+        glEnable(GL_BLEND); /* only touch GL state when it actually needs to change */
+    }
    glBindFramebuffer(GL_FRAMEBUFFER, fbo->fbo);
    glViewport(0, 0, fbo->w, fbo->h);
-    glClearColor(0, 0, 0, 0.0);
+    glClearColor(0, 0, 0, 0); /* clear to fully transparent black */
    glClear(GL_COLOR_BUFFER_BIT);
    return fbo->tex;
 }
--- a/ui/xemu.c
+++ b/ui/xemu.c
@ -32,6 +32,10 @@
 #include "qemu/thread.h"
 #include "qemu/main-loop.h"
 #include "qemu-version.h"
+#include "qemu-common.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-block.h"
+#include "qapi/qmp/qdict.h"
 #include "ui/console.h"
 #include "ui/input.h"
 #include "ui/xemu-display.h"
@ -41,13 +45,9 @@
 #include "xemu-input.h"
 #include "xemu-settings.h"
 #include "xemu-shaders.h"
-#include "hw/xbox/nv2a/gl/gloffscreen.h" // FIXME

-#include "qemu-common.h"
-#include "qapi/error.h"
-#include "qapi/qapi-commands-block.h"
-#include "qapi/qmp/qdict.h"
 #include "hw/xbox/smbus.h" // For eject, drive tray
+#include "hw/xbox/nv2a/nv2a.h"

 // #define DEBUG_XEMU_C

@ -61,8 +61,6 @@ void xb_surface_gl_create_texture(DisplaySurface *surface);
 void xb_surface_gl_update_texture(DisplaySurface *surface, int x, int y, int w, int h);
 void xb_surface_gl_destroy_texture(DisplaySurface *surface);

-static void pre_swap(void);
-static void post_swap(void);
 static void sleep_ns(int64_t ns);

 static int sdl2_num_outputs;
@ -838,7 +836,7 @@ static void sdl2_display_very_early_init(DisplayOptions *o)
    }

    // Initialize offscreen rendering context now
-    glo_context_create();
+    nv2a_gl_context_init();
    SDL_GL_MakeCurrent(NULL, NULL);

    // FIXME: atexit(sdl_cleanup);
@ -1029,88 +1027,13 @@ static void sdl2_set_scanout_mode(struct sdl2_console *scon, bool scanout)
 }
 #endif

-static void xemu_sdl2_gl_render_surface(struct sdl2_console *scon)
-{
-    int ww, wh;
-
-    SDL_GL_MakeCurrent(scon->real_window, scon->winctx);
-    // sdl2_set_scanout_mode(scon, false);
-    SDL_GL_GetDrawableSize(scon->real_window, &ww, &wh);
-
-    // Get texture dimensions
-    int tw, th;
-    glActiveTexture(GL_TEXTURE0);
-    glBindTexture(GL_TEXTURE_2D, scon->surface->texture);
-    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_WIDTH, &tw);
-    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_HEIGHT, &th);
-
-    // Calculate scaling factors
-    float scale[2];
-
-    if (scaling_mode == DISPLAY_SCALE_STRETCH) {
-        // Stretch to fit
-        scale[0] = 1.0;
-        scale[1] = 1.0;
-    } else if (scaling_mode == DISPLAY_SCALE_CENTER) {
-        // Centered
-        scale[0] = (float)tw/(float)ww;
-        scale[1] = (float)th/(float)wh;
-    } else {
-        // Scale to fit
-        float t_ratio = (float)tw/(float)th;
-        float w_ratio = (float)ww/(float)wh;
-        if (w_ratio >= t_ratio) {
-            scale[0] = t_ratio/w_ratio;
-            scale[1] = 1.0;
-        } else {
-            scale[0] = 1.0;
-            scale[1] = w_ratio/t_ratio;
-        }
-    }
-
-    struct decal_shader *s = blit;
-    s->flip = 1;
-
-    glViewport(0, 0, ww, wh);
-    glUseProgram(s->prog);
-    glBindVertexArray(s->vao);
-    glUniform1i(s->FlipY_loc, s->flip);
-    glUniform4f(s->ScaleOffset_loc, scale[0], scale[1], 0, 0);
-    glUniform4f(s->TexScaleOffset_loc, 1.0, 1.0, 0, 0);
-    glUniform1i(s->tex_loc, 0);
-
-    glClearColor(0, 0, 0, 0);
-    glClear(GL_COLOR_BUFFER_BIT);
-    glDrawElements(GL_TRIANGLE_FAN, 4, GL_UNSIGNED_INT, NULL);
-
-    // FIXME: Finer locking
-    qemu_mutex_lock_main_loop();
-    qemu_mutex_lock_iothread();
-    xemu_hud_render();
-    qemu_mutex_unlock_iothread();
-    qemu_mutex_unlock_main_loop();
-
-    // xb_surface_gl_render_texture(scon->surface);
-    pre_swap();
-    SDL_GL_SwapWindow(scon->real_window);
-    post_swap();
-}
-
 void sdl2_gl_update(DisplayChangeListener *dcl,
                    int x, int y, int w, int h)
 {
    struct sdl2_console *scon = container_of(dcl, struct sdl2_console, dcl);
    assert(scon->opengl);
-#if 1
+
    SDL_GL_MakeCurrent(scon->real_window, scon->winctx);
-    if (scon->surface) {
-        // glDeleteTextures(1, &scon->surface->texture);
-        xb_surface_gl_destroy_texture(scon->surface);
-        // assert(glGetError() == GL_NO_ERROR);
-    }
-    xb_surface_gl_create_texture(scon->surface);
-#endif
-    scon->updates++;
 }

 void sdl2_gl_switch(DisplayChangeListener *dcl,
@ -1122,7 +1045,6 @@ void sdl2_gl_switch(DisplayChangeListener *dcl,
    assert(scon->opengl);

    SDL_GL_MakeCurrent(scon->real_window, scon->winctx);
-    xb_surface_gl_destroy_texture(scon->surface);

    scon->surface = new_surface;

@ -1145,8 +1067,6 @@ void sdl2_gl_switch(DisplayChangeListener *dcl,
                (surface_height(old_surface) != surface_height(new_surface)))) {
        // sdl2_window_resize(scon);
    }
-
-    xb_surface_gl_create_texture(scon->surface);
 }

 float fps = 1.0;
@ -1168,22 +1088,149 @@ void sdl2_gl_refresh(DisplayChangeListener *dcl)
 {
    struct sdl2_console *scon = container_of(dcl, struct sdl2_console, dcl);
    assert(scon->opengl);
+    bool flip_required = false; // set only when the VGA fallback texture is used; fed to the blit shader's FlipY uniform

    update_fps();

-    SDL_GL_MakeCurrent(scon->real_window, scon->winctx);
+    /* XXX: Note that this bypasses the usual VGA path in order to quickly
+     * get the surface. This is simple and fast, at the cost of accuracy.
+     * Ideally, this should go through the VGA code and opportunistically pull
+     * the surface like this, but handle the VGA logic as well. For now, just
+     * use this fast path to handle the common case.
+     *
+     * In the event the surface is not found in the surface cache, e.g. when
+     * the guest code isn't using HW accelerated rendering, but just blitting
+     * to the framebuffer, fall back to the VGA path.
+     */
+    GLuint tex = nv2a_get_framebuffer_surface(); // 0 means no surface was found
+    if (tex == 0) {
+        if (scon->surface) {
+            xb_surface_gl_destroy_texture(scon->surface);
+        }
+        xb_surface_gl_create_texture(scon->surface); // NOTE(review): unlike the destroy above, this and the dereference below are not guarded against a NULL surface -- confirm
+        scon->updates++;
+        tex = scon->surface->texture;
+        flip_required = true;
+    }

+    /* FIXME: Finer locking. Event handlers in segments of the code expect
+     * to be running on the main thread with the BQL. For now, acquire the
+     * lock and perform rendering, but release before swap to avoid
+     * possible lengthy blocking (for vsync).
+     */
    qemu_mutex_lock_main_loop();
    qemu_mutex_lock_iothread();
-    graphic_hw_update(dcl->con);
+    sdl2_poll_events(scon);

+    SDL_GL_MakeCurrent(scon->real_window, scon->winctx); // NOTE(review): the fallback texture above is (re)created before this call -- confirm the window context is already current at that point
+
+    glActiveTexture(GL_TEXTURE0);
+    glBindTexture(GL_TEXTURE_2D, tex);
+
+    // Get texture dimensions
+    int tw, th;
+    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_WIDTH, &tw);
+    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_HEIGHT, &th);
+
+    // Get window dimensions
+    int ww, wh;
+    SDL_GL_GetDrawableSize(scon->real_window, &ww, &wh);
+
+    // Calculate scaling factors
+    float scale[2];
+    if (scaling_mode == DISPLAY_SCALE_STRETCH) {
+        // Stretch to fit
+        scale[0] = 1.0;
+        scale[1] = 1.0;
+    } else if (scaling_mode == DISPLAY_SCALE_CENTER) {
+        // Centered
+        scale[0] = (float)tw/(float)ww;
+        scale[1] = (float)th/(float)wh;
+    } else {
+        // Scale to fit
+        float t_ratio = (float)tw/(float)th;
+        float w_ratio = (float)ww/(float)wh;
+        if (w_ratio >= t_ratio) {
+            scale[0] = t_ratio/w_ratio;
+            scale[1] = 1.0;
+        } else {
+            scale[0] = 1.0;
+            scale[1] = w_ratio/t_ratio;
+        }
+    }
+
+    // Render framebuffer and GUI
+    struct decal_shader *s = blit;
+    s->flip = flip_required;
+    glViewport(0, 0, ww, wh);
+    glUseProgram(s->prog);
+    glBindVertexArray(s->vao);
+    glUniform1i(s->FlipY_loc, s->flip);
+    glUniform4f(s->ScaleOffset_loc, scale[0], scale[1], 0, 0);
+    glUniform4f(s->TexScaleOffset_loc, 1.0, 1.0, 0, 0);
+    glUniform1i(s->tex_loc, 0);
+    glClearColor(0, 0, 0, 0);
+    glClear(GL_COLOR_BUFFER_BIT);
+    glDrawElements(GL_TRIANGLE_FAN, 4, GL_UNSIGNED_INT, NULL);
+    xemu_hud_render();
+
+    // GLsync fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+    // int result = glClientWaitSync(fence, GL_SYNC_FLUSH_COMMANDS_BIT, (GLuint64)(5000000000));
+    // assert(result == GL_CONDITION_SATISFIED || result == GL_ALREADY_SIGNALED);
+    // glDeleteSync(fence);
+
+    // Release BQL before swapping (which may sleep)
+    qemu_mutex_unlock_iothread();
+    qemu_mutex_unlock_main_loop();
+
+    SDL_GL_SwapWindow(scon->real_window);
+
+    /* VGA update (see note above) + vblank */
+    qemu_mutex_lock_main_loop();
+    qemu_mutex_lock_iothread();
+    graphic_hw_update(scon->dcl.con);
    if (scon->updates && scon->surface) {
        scon->updates = 0;
    }
-    sdl2_poll_events(scon);
    qemu_mutex_unlock_iothread();
    qemu_mutex_unlock_main_loop();
-    xemu_sdl2_gl_render_surface(scon);
+
+    /*
+     * Throttle to make sure swaps happen at 60Hz
+     */
+    static int64_t last_update = 0;
+    int64_t deadline = last_update + 16666666; // one 60 Hz frame period, in nanoseconds
+
+    int64_t sleep_acc = 0;
+    int64_t spin_acc = 0;
+
+#ifndef _WIN32
+    const int64_t sleep_threshold = 2000000;
+#else
+    const int64_t sleep_threshold = 250000;
+#endif
+
+    while (1) {
+        int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+        int64_t time_remaining = deadline - now;
+        if (now < deadline) {
+            if (time_remaining > sleep_threshold) {
+                // Try to sleep until reaching the sleep threshold.
+                sleep_ns(time_remaining - sleep_threshold);
+                sleep_acc += qemu_clock_get_ns(QEMU_CLOCK_REALTIME)-now;
+            } else {
+                // Simply spin to avoid extra delays incurred with swapping to
+                // another process and back in the event of being within
+                // threshold to desired event.
+                spin_acc++;
+            }
+        } else {
+            DPRINTF("zzZz %g %lld\n", (double)sleep_acc/1000000.0, (long long)spin_acc);
+            last_update = now;
+            break;
+        }
+    }
+
 }

 void sdl2_gl_redraw(struct sdl2_console *scon)
@ -1208,7 +1255,7 @@ QEMUGLContext sdl2_gl_create_context(DisplayChangeListener *dcl,
    SDL_GLContext ctx;

    assert(0);
-    
+
    assert(scon->opengl);

    SDL_GL_MakeCurrent(scon->real_window, scon->winctx);
@ -1286,7 +1333,7 @@ void sdl2_gl_scanout_texture(DisplayChangeListener *dcl,
                             uint32_t w, uint32_t h)
 {
    assert(0);
-#if 0 
+#if 0
    struct sdl2_console *scon = container_of(dcl, struct sdl2_console, dcl);

    assert(scon->opengl);
@ -1374,10 +1421,6 @@ static void *call_qemu_main(void *opaque)
    exit(status);
 }

-static void pre_swap(void)
-{
-}
-
 /* Note: only supports millisecond resolution on Windows */
 static void sleep_ns(int64_t ns)
 {
@ -1391,42 +1434,6 @@ static void sleep_ns(int64_t ns)
 #endif
 }

-static void post_swap(void)
-{
-    // Throttle to make sure swaps happen at 60Hz
-    static int64_t last_update = 0;
-    int64_t deadline = last_update + 16666666;
-    int64_t sleep_acc = 0;
-    int64_t spin_acc = 0;
-
-#ifndef _WIN32
-    const int64_t sleep_threshold = 2000000;
-#else
-    const int64_t sleep_threshold = 250000;
-#endif
-
-    while (1) {
-        int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-        int64_t time_remaining = deadline - now;
-        if (now < deadline) {
-            if (time_remaining > sleep_threshold) {
-                // Try to sleep until the until reaching the sleep threshold.
-                sleep_ns(time_remaining - sleep_threshold);
-                sleep_acc += qemu_clock_get_ns(QEMU_CLOCK_REALTIME)-now;
-            } else {
-                // Simply spin to avoid extra delays incurred with swapping to
-                // another process and back in the event of being within
-                // threshold to desired event.
-                spin_acc++;
-            }
-        } else {
-            DPRINTF("zzZz %g %ld\n", (double)sleep_acc/1000000.0, spin_acc);
-            last_update = now;
-            break;
-        }
-    }
-}
-
 int main(int argc, char **argv)
 {
    QemuThread thread;