gl_stream_buffer: optimize OpenGL buffer handling

This seems to give a huge performance boost for some Mali GPU devices.
2024-11-23 06:09:46 +00:00 · 2024-09-16 22:13:55 +02:00 · 2024-09-16 22:13:55 +02:00 · 107837f506
commit 107837f506
parent 7031479af0
4 changed files with 26 additions and 73 deletions
--- a/src/video_core/renderer_opengl/gl_driver.cpp
+++ b/src/video_core/renderer_opengl/gl_driver.cpp
@ -189,7 +189,7 @@ void Driver::FindBugs() {
    // TODO: Check if these have been fixed in the newer driver
    if (vendor == Vendor::AMD) {
-        bugs |= DriverBug::ShaderStageChangeFreeze | DriverBug::VertexArrayOutOfBound;
+        bugs |= DriverBug::ShaderStageChangeFreeze;
    }
    if (vendor == Vendor::AMD || (vendor == Vendor::Intel && !is_linux)) {
--- a/src/video_core/renderer_opengl/gl_driver.h
+++ b/src/video_core/renderer_opengl/gl_driver.h
@ -27,18 +27,13 @@ enum class Vendor {
 enum class DriverBug {
    // AMD drivers sometimes freezes when one shader stage is changed but not the others.
    ShaderStageChangeFreeze = 1 << 0,
    // On AMD drivers there is a strange crash in indexed drawing. The crash happens when the buffer
    // read position is near the end and is an out-of-bound access to the vertex buffer. This is
    // probably a bug in the driver and is related to the usage of vec3<byte> attributes in the
    // vertex array. Doubling the allocation size for the vertex buffer seems to avoid the crash.
    VertexArrayOutOfBound = 1 << 1,
    // On AMD and Intel drivers on Windows glTextureView produces incorrect results
-    BrokenTextureView = 1 << 2,
+    BrokenTextureView = 1 << 1,
    // On Haswell and Broadwell Intel drivers glClearTexSubImage produces a black screen
-    BrokenClearTexture = 1 << 3,
+    BrokenClearTexture = 1 << 2,
    // On some Mali GPUs, the texture buffer size is small and has reduced performance
    // if the buffer is close to the maximum texture size
-    SlowTextureBufferWithBigSize = 1 << 4,
+    SlowTextureBufferWithBigSize = 1 << 3,
 };
 /**
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@ -10,35 +10,15 @@
 namespace OpenGL {
-OGLStreamBuffer::OGLStreamBuffer(Driver& driver, GLenum target, GLsizeiptr size,
+OGLStreamBuffer::OGLStreamBuffer(Driver& driver, GLenum target, GLsizeiptr size)
                                 bool prefer_coherent)
    : gl_target(target), buffer_size(size) {
    gl_buffer.Create();
    glBindBuffer(gl_target, gl_buffer.handle);
-
+    // prefer `glBufferData` over `glBufferStorage` on mobile devices
-    GLsizeiptr allocate_size = size;
+    glBufferData(gl_target, buffer_size, nullptr, GL_STREAM_DRAW);
    if (driver.HasBug(DriverBug::VertexArrayOutOfBound) && target == GL_ARRAY_BUFFER) {
        allocate_size *= 2;
    }
    if (GLAD_GL_ARB_buffer_storage) {
        persistent = true;
        coherent = prefer_coherent;
        GLbitfield flags =
            GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
        glBufferStorage(gl_target, allocate_size, nullptr, flags);
        mapped_ptr = static_cast<u8*>(glMapBufferRange(
            gl_target, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
    } else {
        glBufferData(gl_target, allocate_size, nullptr, GL_STREAM_DRAW);
    }
 }
 OGLStreamBuffer::~OGLStreamBuffer() {
    if (persistent) {
        glBindBuffer(gl_target, gl_buffer.handle);
        glUnmapBuffer(gl_target);
    }
    gl_buffer.Release();
 }
@ -51,48 +31,33 @@ GLsizeiptr OGLStreamBuffer::GetSize() const {
 }
 std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
    ASSERT_MSG(size <= buffer_size, "Requested size {} exceeds buffer size {}", size, buffer_size);
    ASSERT(alignment <= buffer_size);
    mapped_size = size;
    if (alignment > 0) {
        buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment);
    }
    bool invalidate = false;
    buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment);
    if (buffer_pos + size > buffer_size) {
        buffer_pos = 0;
        invalidate = true;
        if (persistent) {
            glUnmapBuffer(gl_target);
        }
    }
-    if (invalidate || !persistent) {
+    GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT |
-        MANDARINE_PROFILE("OpenGL", "Stream Buffer Orphaning");
+                       (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
-        GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) |
+    u8* mapped_ptr = static_cast<u8*>(glMapBufferRange(gl_target, buffer_pos, size, flags));
-                           (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) |
+    return std::make_tuple(mapped_ptr, buffer_pos, invalidate);
                           (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
        mapped_ptr = static_cast<u8*>(
            glMapBufferRange(gl_target, buffer_pos, buffer_size - buffer_pos, flags));
        mapped_offset = buffer_pos;
    }
    return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
 }
 void OGLStreamBuffer::Unmap(GLsizeiptr size) {
-    ASSERT(size <= mapped_size);
+    if (size > 0) {
-
+        // the flush offset is relative to the start of the currently mapped range of the buffer
-    if (!coherent && size > 0) {
+        glFlushMappedBufferRange(gl_target, 0, size);
-        glFlushMappedBufferRange(gl_target, buffer_pos - mapped_offset, size);
+        GLenum error = glGetError();
        if (error != GL_NO_ERROR) {
            LOG_DEBUG(Render_OpenGL,
                      "flush mapped buffer range error: {:04X}, target: {:04X}, offset: {}, size: "
                      "{}, total: {}",
                      error, gl_target, buffer_pos, size, buffer_size);
        }
    }
-
+    glUnmapBuffer(gl_target);
    if (!persistent) {
        glUnmapBuffer(gl_target);
    }
    buffer_pos += size;
 }
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@ -13,8 +13,7 @@ class Driver;
 class OGLStreamBuffer : private NonCopyable {
 public:
-    explicit OGLStreamBuffer(Driver& driver, GLenum target, GLsizeiptr size,
+    explicit OGLStreamBuffer(Driver& driver, GLenum target, GLsizeiptr size);
                             bool prefer_coherent = false);
    ~OGLStreamBuffer();
    GLuint GetHandle() const;
@ -28,7 +27,7 @@ public:
     * and the invalidation flag for previous chunks.
     * The actual used size must be specified on unmapping the chunk.
     */
-    std::tuple<u8*, GLintptr, bool> Map(GLsizeiptr size, GLintptr alignment = 0);
+    std::tuple<u8*, GLintptr, bool> Map(GLsizeiptr size, GLintptr alignment);
    void Unmap(GLsizeiptr size);
@ -36,14 +35,8 @@ private:
    OGLBuffer gl_buffer;
    GLenum gl_target;
    bool coherent = false;
    bool persistent = false;
    GLintptr buffer_pos = 0;
    GLsizeiptr buffer_size = 0;
    GLintptr mapped_offset = 0;
    GLsizeiptr mapped_size = 0;
    u8* mapped_ptr = nullptr;
 };
 } // namespace OpenGL