gl_stream_buffer: optimize OpenGL buffer handling
Some checks failed
mandarine-build / macos (arm64) (push) Waiting to run
mandarine-build / macos (x86_64) (push) Waiting to run
mandarine-build / macos-universal (push) Blocked by required conditions
mandarine-build / windows (msvc) (push) Waiting to run
mandarine-build / windows (msys2) (push) Waiting to run
mandarine-build / release (push) Blocked by required conditions
mandarine-build / source (push) Failing after 0s
mandarine-build / linux (appimage) (push) Failing after 0s
mandarine-build / linux (fresh) (push) Failing after 0s
mandarine-build / android (push) Failing after 0s
mandarine-format / clang-format (push) Failing after 0s

This seems to give a huge performance boost for some Mali GPU devices.
This commit is contained in:
weihuoya 2024-09-16 22:13:55 +02:00 committed by Gamer64
parent 7031479af0
commit 107837f506
4 changed files with 26 additions and 73 deletions

View File

@ -189,7 +189,7 @@ void Driver::FindBugs() {
// TODO: Check if these have been fixed in the newer driver // TODO: Check if these have been fixed in the newer driver
if (vendor == Vendor::AMD) { if (vendor == Vendor::AMD) {
bugs |= DriverBug::ShaderStageChangeFreeze | DriverBug::VertexArrayOutOfBound; bugs |= DriverBug::ShaderStageChangeFreeze;
} }
if (vendor == Vendor::AMD || (vendor == Vendor::Intel && !is_linux)) { if (vendor == Vendor::AMD || (vendor == Vendor::Intel && !is_linux)) {

View File

@ -27,18 +27,13 @@ enum class Vendor {
enum class DriverBug { enum class DriverBug {
// AMD drivers sometimes freeze when one shader stage is changed but not the others. // AMD drivers sometimes freeze when one shader stage is changed but not the others.
ShaderStageChangeFreeze = 1 << 0, ShaderStageChangeFreeze = 1 << 0,
// On AMD drivers there is a strange crash in indexed drawing. The crash happens when the buffer
// read position is near the end and is an out-of-bound access to the vertex buffer. This is
// probably a bug in the driver and is related to the usage of vec3<byte> attributes in the
// vertex array. Doubling the allocation size for the vertex buffer seems to avoid the crash.
VertexArrayOutOfBound = 1 << 1,
// On AMD and Intel drivers on Windows glTextureView produces incorrect results // On AMD and Intel drivers on Windows glTextureView produces incorrect results
BrokenTextureView = 1 << 2, BrokenTextureView = 1 << 1,
// On Haswell and Broadwell Intel drivers glClearTexSubImage produces a black screen // On Haswell and Broadwell Intel drivers glClearTexSubImage produces a black screen
BrokenClearTexture = 1 << 3, BrokenClearTexture = 1 << 2,
// On some Mali GPUs, the texture buffer size is small and has reduced performance // On some Mali GPUs, the texture buffer size is small and has reduced performance
// if the buffer is close to the maximum texture size // if the buffer is close to the maximum texture size
SlowTextureBufferWithBigSize = 1 << 4, SlowTextureBufferWithBigSize = 1 << 3,
}; };
/** /**

View File

@ -10,35 +10,15 @@
namespace OpenGL { namespace OpenGL {
OGLStreamBuffer::OGLStreamBuffer(Driver& driver, GLenum target, GLsizeiptr size, OGLStreamBuffer::OGLStreamBuffer(Driver& driver, GLenum target, GLsizeiptr size)
bool prefer_coherent)
: gl_target(target), buffer_size(size) { : gl_target(target), buffer_size(size) {
gl_buffer.Create(); gl_buffer.Create();
glBindBuffer(gl_target, gl_buffer.handle); glBindBuffer(gl_target, gl_buffer.handle);
// prefer `glBufferData` over `glBufferStorage` on mobile devices
GLsizeiptr allocate_size = size; glBufferData(gl_target, buffer_size, nullptr, GL_STREAM_DRAW);
if (driver.HasBug(DriverBug::VertexArrayOutOfBound) && target == GL_ARRAY_BUFFER) {
allocate_size *= 2;
}
if (GLAD_GL_ARB_buffer_storage) {
persistent = true;
coherent = prefer_coherent;
GLbitfield flags =
GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
glBufferStorage(gl_target, allocate_size, nullptr, flags);
mapped_ptr = static_cast<u8*>(glMapBufferRange(
gl_target, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
} else {
glBufferData(gl_target, allocate_size, nullptr, GL_STREAM_DRAW);
}
} }
OGLStreamBuffer::~OGLStreamBuffer() { OGLStreamBuffer::~OGLStreamBuffer() {
if (persistent) {
glBindBuffer(gl_target, gl_buffer.handle);
glUnmapBuffer(gl_target);
}
gl_buffer.Release(); gl_buffer.Release();
} }
@ -51,48 +31,33 @@ GLsizeiptr OGLStreamBuffer::GetSize() const {
} }
std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) { std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
ASSERT_MSG(size <= buffer_size, "Requested size {} exceeds buffer size {}", size, buffer_size);
ASSERT(alignment <= buffer_size);
mapped_size = size;
if (alignment > 0) {
buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment);
}
bool invalidate = false; bool invalidate = false;
buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment);
if (buffer_pos + size > buffer_size) { if (buffer_pos + size > buffer_size) {
buffer_pos = 0; buffer_pos = 0;
invalidate = true; invalidate = true;
if (persistent) {
glUnmapBuffer(gl_target);
}
} }
if (invalidate || !persistent) { GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT |
MANDARINE_PROFILE("OpenGL", "Stream Buffer Orphaning"); (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) | u8* mapped_ptr = static_cast<u8*>(glMapBufferRange(gl_target, buffer_pos, size, flags));
(coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) | return std::make_tuple(mapped_ptr, buffer_pos, invalidate);
(invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
mapped_ptr = static_cast<u8*>(
glMapBufferRange(gl_target, buffer_pos, buffer_size - buffer_pos, flags));
mapped_offset = buffer_pos;
}
return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
} }
void OGLStreamBuffer::Unmap(GLsizeiptr size) { void OGLStreamBuffer::Unmap(GLsizeiptr size) {
ASSERT(size <= mapped_size); if (size > 0) {
// flush is relative to the start of the currently mapped range of the buffer
if (!coherent && size > 0) { glFlushMappedBufferRange(gl_target, 0, size);
glFlushMappedBufferRange(gl_target, buffer_pos - mapped_offset, size); GLenum error = glGetError();
if (error != GL_NO_ERROR) {
LOG_DEBUG(Render_OpenGL,
"flush mapped buffer range error: {:04X}, target: {:04X}, offset: {}, size: "
"{}, total: {}",
error, gl_target, buffer_pos, size, buffer_size);
}
} }
glUnmapBuffer(gl_target);
if (!persistent) {
glUnmapBuffer(gl_target);
}
buffer_pos += size; buffer_pos += size;
} }

View File

@ -13,8 +13,7 @@ class Driver;
class OGLStreamBuffer : private NonCopyable { class OGLStreamBuffer : private NonCopyable {
public: public:
explicit OGLStreamBuffer(Driver& driver, GLenum target, GLsizeiptr size, explicit OGLStreamBuffer(Driver& driver, GLenum target, GLsizeiptr size);
bool prefer_coherent = false);
~OGLStreamBuffer(); ~OGLStreamBuffer();
GLuint GetHandle() const; GLuint GetHandle() const;
@ -28,7 +27,7 @@ public:
* and the invalidation flag for previous chunks. * and the invalidation flag for previous chunks.
* The actual used size must be specified on unmapping the chunk. * The actual used size must be specified on unmapping the chunk.
*/ */
std::tuple<u8*, GLintptr, bool> Map(GLsizeiptr size, GLintptr alignment = 0); std::tuple<u8*, GLintptr, bool> Map(GLsizeiptr size, GLintptr alignment);
void Unmap(GLsizeiptr size); void Unmap(GLsizeiptr size);
@ -36,14 +35,8 @@ private:
OGLBuffer gl_buffer; OGLBuffer gl_buffer;
GLenum gl_target; GLenum gl_target;
bool coherent = false;
bool persistent = false;
GLintptr buffer_pos = 0; GLintptr buffer_pos = 0;
GLsizeiptr buffer_size = 0; GLsizeiptr buffer_size = 0;
GLintptr mapped_offset = 0;
GLsizeiptr mapped_size = 0;
u8* mapped_ptr = nullptr;
}; };
} // namespace OpenGL } // namespace OpenGL