From 128fcdac26a9f7b6eb5174b88483c7ca023f210b Mon Sep 17 00:00:00 2001 From: degasus Date: Thu, 23 Jan 2014 00:47:49 +0100 Subject: [PATCH] OpenGL: refactor all of our StreamBuffers The old way was to use big switch/case statements based on the type of buffer. The new one is to use inheritance. This change prevents us from changing the buffer type while running, but I doubt we'll ever do so. Performance should also be a bit better. Also a nice cleanup. Added some comments about these different kinds of buffers. --- .../VideoBackends/OGL/ProgramShaderCache.cpp | 14 +- .../Core/VideoBackends/OGL/StreamBuffer.cpp | 545 +++++++++++------- Source/Core/VideoBackends/OGL/StreamBuffer.h | 49 +- .../Core/VideoBackends/OGL/VertexManager.cpp | 21 +- 4 files changed, 373 insertions(+), 256 deletions(-) diff --git a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp index 8f3a545261..000398c787 100644 --- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp +++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp @@ -193,18 +193,18 @@ void ProgramShaderCache::UploadConstants() { if(PixelShaderManager::dirty || VertexShaderManager::dirty) { - u8* buffer = s_buffer->Map(s_ubo_buffer_size, s_ubo_align); + auto buffer = s_buffer->Map(s_ubo_buffer_size, s_ubo_align); - memcpy(buffer, + memcpy(buffer.first, &PixelShaderManager::constants, sizeof(PixelShaderConstants)); - memcpy(buffer + ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align), + memcpy(buffer.first + ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align), &VertexShaderManager::constants, sizeof(VertexShaderConstants)); - size_t offset = s_buffer->Unmap(s_ubo_buffer_size); - glBindBufferRange(GL_UNIFORM_BUFFER, 1, s_buffer->getBuffer(), offset, + s_buffer->Unmap(s_ubo_buffer_size); + glBindBufferRange(GL_UNIFORM_BUFFER, 1, s_buffer->m_buffer, buffer.second, sizeof(PixelShaderConstants)); - glBindBufferRange(GL_UNIFORM_BUFFER, 2, s_buffer->getBuffer(), offset + 
ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align), + glBindBufferRange(GL_UNIFORM_BUFFER, 2, s_buffer->m_buffer, buffer.second + ROUND_UP(sizeof(PixelShaderConstants), s_ubo_align), sizeof(VertexShaderConstants)); PixelShaderManager::dirty = false; @@ -471,7 +471,7 @@ void ProgramShaderCache::Init(void) // We multiply by *4*4 because we need to get down to basic machine units. // So multiply by four to get how many floats we have from vec4s // Then once more to get bytes - s_buffer = new StreamBuffer(GL_UNIFORM_BUFFER, UBO_LENGTH); + s_buffer = StreamBuffer::Create(GL_UNIFORM_BUFFER, UBO_LENGTH); } // Read our shader cache, only if supported diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp index 28585e07d7..71bfa404f6 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp @@ -13,233 +13,60 @@ namespace OGL { -static const u32 SYNC_POINTS = 16; -static const u32 ALIGN_PINNED_MEMORY = 4096; +// moved out of constructor, so m_buffer is allowed to be const +static u32 genBuffer() +{ + u32 id; + glGenBuffers(1, &id); + return id; +} StreamBuffer::StreamBuffer(u32 type, size_t size) -: m_buffertype(type), m_size(size) -{ - glGenBuffers(1, &m_buffer); - - bool nvidia = !strcmp(g_ogl_config.gl_vendor, "NVIDIA Corporation"); - - if (g_ogl_config.bSupportsGLBufferStorage && - !(DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTORAGE) && type == GL_ARRAY_BUFFER)) - m_uploadtype = BUFFERSTORAGE; - else if(!g_ogl_config.bSupportsGLBaseVertex && !DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTREAM)) - m_uploadtype = BUFFERSUBDATA; - else if(!g_ogl_config.bSupportsGLBaseVertex) - m_uploadtype = BUFFERDATA; - else if(g_ogl_config.bSupportsGLSync && g_ogl_config.bSupportsGLPinnedMemory && - !(DriverDetails::HasBug(DriverDetails::BUG_BROKENPINNEDMEMORY) && type == GL_ELEMENT_ARRAY_BUFFER)) - m_uploadtype = PINNED_MEMORY; - else if(nvidia) - 
m_uploadtype = BUFFERSUBDATA; - else if(g_ogl_config.bSupportsGLSync) - m_uploadtype = MAP_AND_SYNC; - else - m_uploadtype = MAP_AND_ORPHAN; - - Init(); -} - -StreamBuffer::~StreamBuffer() -{ - Shutdown(); - glDeleteBuffers(1, &m_buffer); -} - -#define SLOT(x) ((x)*SYNC_POINTS/m_size) - -u8* StreamBuffer::Map ( size_t size, u32 stride ) -{ - if(m_iterator && stride) { - m_iterator--; - m_iterator = m_iterator - (m_iterator % stride) + stride; - } - - switch(m_uploadtype) { - case MAP_AND_ORPHAN: - if(m_iterator + size >= m_size) { - glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW); - m_iterator = 0; - } - break; - case MAP_AND_SYNC: - case PINNED_MEMORY: - case BUFFERSTORAGE: - // insert waiting slots for used memory - for (size_t i = SLOT(m_used_iterator); i < SLOT(m_iterator); i++) - { - fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - } - m_used_iterator = m_iterator; - - // wait for new slots to end of buffer - for (size_t i = SLOT(m_free_iterator) + 1; i <= SLOT(m_iterator + size) && i < SYNC_POINTS; i++) - { - glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED); - glDeleteSync(fences[i]); - } - m_free_iterator = m_iterator + size; - - // if buffer is full - if (m_iterator + size >= m_size) { - - // insert waiting slots in unused space at the end of the buffer - for (size_t i = SLOT(m_used_iterator); i < SYNC_POINTS; i++) - { - fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - } - - // move to the start - m_used_iterator = m_iterator = 0; // offset 0 is always aligned - - // wait for space at the start - for (u32 i = 0; i <= SLOT(m_iterator + size); i++) - { - glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED); - glDeleteSync(fences[i]); - } - m_free_iterator = m_iterator + size; - } - break; - case BUFFERSUBDATA: - case BUFFERDATA: - m_iterator = 0; - break; - } - - // MAP_AND_* methods need to remap this buffer every time - switch(m_uploadtype) { - case MAP_AND_ORPHAN: - case 
MAP_AND_SYNC: - pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size, - GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT) - m_iterator; - break; - case PINNED_MEMORY: - case BUFFERSTORAGE: - case BUFFERSUBDATA: - case BUFFERDATA: - break; - } - return pointer + m_iterator; -} - -size_t StreamBuffer::Unmap(size_t used_size) -{ - size_t ret = m_iterator; - switch(m_uploadtype) { - case MAP_AND_SYNC: - case MAP_AND_ORPHAN: - glFlushMappedBufferRange(m_buffertype, 0, used_size); - glUnmapBuffer(m_buffertype); - break; - case PINNED_MEMORY: - case BUFFERSTORAGE: - case BUFFERSUBDATA: - glBufferSubData(m_buffertype, 0, used_size, pointer); - break; - case BUFFERDATA: - glBufferData(m_buffertype, used_size, pointer, GL_STREAM_DRAW); - break; - } - m_iterator += used_size; - return ret; -} - -void StreamBuffer::Init() +: m_buffer(genBuffer()), m_buffertype(type), m_size(size) { m_iterator = 0; m_used_iterator = 0; m_free_iterator = 0; - - switch(m_uploadtype) { - case MAP_AND_SYNC: - fences = new GLsync[SYNC_POINTS]; - for(u32 i=0; i= m_size) { + + // insert waiting slots in unused space at the end of the buffer + for (size_t i = SLOT(m_used_iterator); i < SYNC_POINTS; i++) + { + fences[i] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + } + + // move to the start + m_used_iterator = m_iterator = 0; // offset 0 is always aligned + + // wait for space at the start + for (u32 i = 0; i <= SLOT(m_iterator + size); i++) + { + glClientWaitSync(fences[i], GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED); + glDeleteSync(fences[i]); + } + m_free_iterator = m_iterator + size; + } +} +#undef SLOT + +void StreamBuffer::Align(u32 stride) +{ + if(m_iterator && stride) { + m_iterator--; + m_iterator = m_iterator - (m_iterator % stride) + stride; + } +} + +/* The usual way to stream data to the gpu. 
+ * Described here: https://www.opengl.org/wiki/Buffer_Object_Streaming#Unsynchronized_buffer_mapping + * Just do unsync appends until the buffer is full. + * When it's full, orphan (alloc a new buffer and free the old one) + * + * As reallocation is an overhead, this method isn't as fast as it is reputed to be. + */ +class MapAndOrphan : public StreamBuffer +{ +public: + MapAndOrphan(u32 type, size_t size) : StreamBuffer(type, size) { + glBindBuffer(m_buffertype, m_buffer); + glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW); + } + + ~MapAndOrphan() { + } + + std::pair Map(size_t size, u32 stride) { + Align(stride); + if(m_iterator + size >= m_size) { + glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW); + m_iterator = 0; + } + u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size, + GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT); + return std::make_pair(pointer, m_iterator); + } + + void Unmap(size_t used_size) { + glFlushMappedBufferRange(m_buffertype, 0, used_size); + glUnmapBuffer(m_buffertype); + m_iterator += used_size; + } +}; + +/* A modified streaming way without reallocation + * This one fixes the reallocation overhead of the MapAndOrphan one. + * So it allocates a ring buffer on initialization. + * But with this limited resource, we have to care about the cpu-gpu distance. + * Else this fifo may overflow. + * So we have traded orphaning for syncing. 
+ */ +class MapAndSync : public StreamBuffer +{ +public: + MapAndSync(u32 type, size_t size) : StreamBuffer(type, size) { + CreateFences(); + glBindBuffer(m_buffertype, m_buffer); + glBufferData(m_buffertype, m_size, NULL, GL_STREAM_DRAW); + } + + ~MapAndSync() { + DeleteFences(); + } + + std::pair Map(size_t size, u32 stride) { + Align(stride); + AllocMemory(size); + u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size, + GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT); + return std::make_pair(pointer, m_iterator); + } + + void Unmap(size_t used_size) { + glFlushMappedBufferRange(m_buffertype, 0, used_size); + glUnmapBuffer(m_buffertype); + m_iterator += used_size; + } +}; + +/* Streaming fifo without mapping overhead. + * This one usually requires ARB_buffer_storage (OpenGL 4.4). + * And is usually not available on OpenGL3 gpus. + * + * ARB_buffer_storage allows us to render from a mapped buffer. + * So we map it persistently in the initialization. + * + * Unsync mapping sounds like an easy task, but it isn't for threaded drivers. + * So every mapping on current closed-source drivers _will_ end in + * at least a round trip time between two threads. + * + * As persistently mapped buffers can't use orphaning, we also have to sync. 
+ */ +class BufferStorage : public StreamBuffer +{ +public: + BufferStorage(u32 type, size_t size) : StreamBuffer(type, size) { + CreateFences(); + glBindBuffer(m_buffertype, m_buffer); + + // PERSISTENT_BIT to make sure that the buffer can be used while mapped + // COHERENT_BIT is set so we don't have to use a MemoryBarrier on write + // CLIENT_STORAGE_BIT is set since we access the buffer more frequently on the client side than on the server side + glBufferStorage(m_buffertype, m_size, NULL, + GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT | GL_CLIENT_STORAGE_BIT); + m_pointer = (u8*)glMapBufferRange(m_buffertype, 0, m_size, + GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT); + } + + ~BufferStorage() { + DeleteFences(); + glUnmapBuffer(m_buffertype); + glBindBuffer(m_buffertype, 0); + } + + std::pair Map(size_t size, u32 stride) { + Align(stride); + AllocMemory(size); + return std::make_pair(m_pointer + m_iterator, m_iterator); + } + + void Unmap(size_t used_size) { + m_iterator += used_size; + } + + u8* m_pointer; +}; + +/* --- AMD only --- + * Another streaming fifo without mapping overhead. + * As we can't orphan without mapping, we have to sync. + * + * This one uses AMD_pinned_memory which is available on all AMD gpus. + * OpenGL 4.4 drivers should use BufferStorage. 
+ */ +class PinnedMemory : public StreamBuffer +{ +public: + PinnedMemory(u32 type, size_t size) : StreamBuffer(type, size) { + CreateFences(); + m_pointer = (u8*)AllocateAlignedMemory(ROUND_UP(m_size,ALIGN_PINNED_MEMORY), ALIGN_PINNED_MEMORY ); + glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, m_buffer); + glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, ROUND_UP(m_size,ALIGN_PINNED_MEMORY), m_pointer, GL_STREAM_COPY); + glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, 0); + glBindBuffer(m_buffertype, m_buffer); + } + + ~PinnedMemory() { + DeleteFences(); + glBindBuffer(m_buffertype, 0); + glFinish(); // ogl pipeline must be flushed, else this buffer can be in use + FreeAlignedMemory(m_pointer); + } + + std::pair Map(size_t size, u32 stride) { + Align(stride); + AllocMemory(size); + return std::make_pair(m_pointer + m_iterator, m_iterator); + } + + void Unmap(size_t used_size) { + m_iterator += used_size; + } + + u8* m_pointer; + static const u32 ALIGN_PINNED_MEMORY = 4096; +}; + +/* Fifo based on the glBufferSubData call. + * As everything must be copied before glBufferSubData returns, + * an additional memcpy in the driver will be done. + * So this is a huge overhead, only use it if required. + */ +class BufferSubData : public StreamBuffer +{ +public: + BufferSubData(u32 type, size_t size) : StreamBuffer(type, size) { + glBindBuffer(m_buffertype, m_buffer); + glBufferData(m_buffertype, size, 0, GL_STATIC_DRAW); + m_pointer = new u8[m_size]; + } + + ~BufferSubData() { + delete [] m_pointer; + } + + std::pair Map(size_t size, u32 stride) { + return std::make_pair(m_pointer, 0); + } + + void Unmap(size_t used_size) { + glBufferSubData(m_buffertype, 0, used_size, m_pointer); + } + + u8* m_pointer; +}; + +/* Fifo based on the glBufferData call. + * Some trashy drivers stall in BufferSubData. + * So here we use glBufferData, which realloc this buffer every time. + * This may avoid stalls, but it is a bigger overhead than BufferSubData. 
+ */ +class BufferData : public StreamBuffer +{ +public: + BufferData(u32 type, size_t size) : StreamBuffer(type, size) { + glBindBuffer(m_buffertype, m_buffer); + m_pointer = new u8[m_size]; + } + + ~BufferData() { + delete [] m_pointer; + } + + std::pair Map(size_t size, u32 stride) { + return std::make_pair(m_pointer, 0); + } + + void Unmap(size_t used_size) { + glBufferData(m_buffertype, used_size, m_pointer, GL_STREAM_DRAW); + } + + u8* m_pointer; +}; + +// choose best streaming library based on the supported extensions and known issues +StreamBuffer* StreamBuffer::Create(u32 type, size_t size) +{ + bool nvidia = !strcmp(g_ogl_config.gl_vendor, "NVIDIA Corporation"); + + if (g_ogl_config.bSupportsGLBufferStorage && + !(DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTORAGE) && type == GL_ARRAY_BUFFER)) + return new BufferStorage(type, size); + else if(!g_ogl_config.bSupportsGLBaseVertex && !DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTREAM)) + return new BufferSubData(type, size); + else if(!g_ogl_config.bSupportsGLBaseVertex) + return new BufferData(type, size); + else if(g_ogl_config.bSupportsGLSync && g_ogl_config.bSupportsGLPinnedMemory && + !(DriverDetails::HasBug(DriverDetails::BUG_BROKENPINNEDMEMORY) && type == GL_ELEMENT_ARRAY_BUFFER)) + return new PinnedMemory(type, size); + else if(nvidia) + return new BufferSubData(type, size); + else if(g_ogl_config.bSupportsGLSync) + return new MapAndSync(type, size); + else + return new MapAndOrphan(type, size); +} } diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.h b/Source/Core/VideoBackends/OGL/StreamBuffer.h index abef139546..1f64fa56ec 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.h +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.h @@ -5,6 +5,7 @@ #ifndef STREAMBUFFER_H #define STREAMBUFFER_H +#include #include "VideoCommon.h" #include "FramebufferManager.h" #include "GLUtil.h" @@ -17,39 +18,41 @@ namespace OGL { -enum StreamType { - MAP_AND_ORPHAN = (1 << 1), - 
MAP_AND_SYNC = (1 << 2), - PINNED_MEMORY = (1 << 3), - BUFFERSUBDATA = (1 << 4), - BUFFERDATA = (1 << 5), - BUFFERSTORAGE = (1 << 6), -}; class StreamBuffer { public: + static StreamBuffer* Create(u32 type, size_t size); + virtual ~StreamBuffer(); + + /* This mapping function will return a pair of: + * - the pointer to the mapped buffer + * - the offset into the real gpu buffer (always multiple of stride) + * On mapping, the maximum of size for allocation has to be set. + * The size really pushed into this fifo only has to be known on Unmapping. + * Mapping invalidates the current buffer content, + * so it isn't allowed to access the old content any more. + */ + virtual std::pair Map(size_t size, u32 stride = 0) = 0; + virtual void Unmap(size_t used_size) = 0; + + const u32 m_buffer; + +protected: StreamBuffer(u32 type, size_t size); - ~StreamBuffer(); - - u8* Map(size_t size, u32 stride = 0); - size_t Unmap(size_t used_size); // returns the offset of the beginning of the uploaded block - - inline u32 getBuffer() { return m_buffer; } - -private: - void Init(); - void Shutdown(); + void CreateFences(); void DeleteFences(); + void AllocMemory(size_t size); + void Align(u32 stride); - StreamType m_uploadtype; - u32 m_buffer; - u32 m_buffertype; - size_t m_size; - u8 *pointer; + const u32 m_buffertype; + const size_t m_size; + size_t m_iterator; size_t m_used_iterator; size_t m_free_iterator; + +private: GLsync *fences; }; diff --git a/Source/Core/VideoBackends/OGL/VertexManager.cpp b/Source/Core/VideoBackends/OGL/VertexManager.cpp index 07a6a24196..7e1ead8954 100644 --- a/Source/Core/VideoBackends/OGL/VertexManager.cpp +++ b/Source/Core/VideoBackends/OGL/VertexManager.cpp @@ -58,11 +58,11 @@ VertexManager::~VertexManager() void VertexManager::CreateDeviceObjects() { - s_vertexBuffer = new StreamBuffer(GL_ARRAY_BUFFER, MAX_VBUFFER_SIZE); - m_vertex_buffers = s_vertexBuffer->getBuffer(); + s_vertexBuffer = StreamBuffer::Create(GL_ARRAY_BUFFER, MAX_VBUFFER_SIZE); + 
m_vertex_buffers = s_vertexBuffer->m_buffer; - s_indexBuffer = new StreamBuffer(GL_ELEMENT_ARRAY_BUFFER, MAX_IBUFFER_SIZE); - m_index_buffers = s_indexBuffer->getBuffer(); + s_indexBuffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, MAX_IBUFFER_SIZE); + m_index_buffers = s_indexBuffer->m_buffer; m_CurrentVertexFmt = NULL; m_last_vao = 0; @@ -85,14 +85,15 @@ void VertexManager::PrepareDrawBuffers(u32 stride) u32 vertex_data_size = IndexGenerator::GetNumVerts() * stride; u32 index_data_size = IndexGenerator::GetIndexLen() * sizeof(u16); - u8* buffer = s_vertexBuffer->Map(vertex_data_size, stride); - memcpy(buffer, GetVertexBuffer(), vertex_data_size); - size_t offset = s_vertexBuffer->Unmap(vertex_data_size); - s_baseVertex = offset / stride; + auto buffer = s_vertexBuffer->Map(vertex_data_size, stride); + memcpy(buffer.first, GetVertexBuffer(), vertex_data_size); + s_vertexBuffer->Unmap(vertex_data_size); + s_baseVertex = buffer.second / stride; buffer = s_indexBuffer->Map(index_data_size); - memcpy(buffer, GetIndexBuffer(), index_data_size); - s_index_offset = s_indexBuffer->Unmap(index_data_size); + memcpy(buffer.first, GetIndexBuffer(), index_data_size); + s_indexBuffer->Unmap(index_data_size); + s_index_offset = buffer.second; ADDSTAT(stats.thisFrame.bytesVertexStreamed, vertex_data_size); ADDSTAT(stats.thisFrame.bytesIndexStreamed, index_data_size);