OGL: Re-implement async shader compiling

2024-10-07 15:53:34 +00:00 · 2018-02-25 17:56:09 +10:00 · 2018-02-25 17:56:09 +10:00 · f9c829c7f7
commit f9c829c7f7
parent dec0c3bce8
16 changed files with 136 additions and 33 deletions
--- a/Source/Core/VideoBackends/D3D/main.cpp
+++ b/Source/Core/VideoBackends/D3D/main.cpp
@ -70,6 +70,7 @@ void VideoBackend::InitBackendInfo()
  g_Config.backend_info.bSupportsDynamicSamplerIndexing = false;
  g_Config.backend_info.bSupportsBPTCTextures = false;
  g_Config.backend_info.bSupportsFramebufferFetch = false;
+  g_Config.backend_info.bSupportsBackgroundCompiling = true;

  IDXGIFactory2* factory;
  IDXGIAdapter* ad;
--- a/Source/Core/VideoBackends/Null/NullBackend.cpp
+++ b/Source/Core/VideoBackends/Null/NullBackend.cpp
@ -46,6 +46,7 @@ void VideoBackend::InitBackendInfo()
  g_Config.backend_info.bSupportsST3CTextures = false;
  g_Config.backend_info.bSupportsBPTCTextures = false;
  g_Config.backend_info.bSupportsFramebufferFetch = false;
+  g_Config.backend_info.bSupportsBackgroundCompiling = false;

  // aamodes: We only support 1 sample, so no MSAA
  g_Config.backend_info.Adapters.clear();
--- a/Source/Core/VideoBackends/OGL/OGLPipeline.cpp
+++ b/Source/Core/VideoBackends/OGL/OGLPipeline.cpp
@ -46,10 +46,11 @@ OGLPipeline::~OGLPipeline()

 std::unique_ptr<OGLPipeline> OGLPipeline::Create(const AbstractPipelineConfig& config)
 {
-  const PipelineProgram* program =
-      ProgramShaderCache::GetPipelineProgram(static_cast<const OGLShader*>(config.vertex_shader),
-                                             static_cast<const OGLShader*>(config.geometry_shader),
-                                             static_cast<const OGLShader*>(config.pixel_shader));
+  const PipelineProgram* program = ProgramShaderCache::GetPipelineProgram(
+      static_cast<const GLVertexFormat*>(config.vertex_format),
+      static_cast<const OGLShader*>(config.vertex_shader),
+      static_cast<const OGLShader*>(config.geometry_shader),
+      static_cast<const OGLShader*>(config.pixel_shader));
  if (!program)
    return nullptr;

--- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp
+++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp
@ -51,9 +51,10 @@ static std::unique_ptr<StreamBuffer> s_buffer;
 static int num_failures = 0;

 static GLuint CurrentProgram = 0;
-ProgramShaderCache::PipelineProgramMap ProgramShaderCache::pipelineprograms;
-std::mutex ProgramShaderCache::pipelineprogramlock;
+ProgramShaderCache::PipelineProgramMap ProgramShaderCache::s_pipeline_programs;
+std::mutex ProgramShaderCache::s_pipeline_program_lock;
 static std::string s_glsl_header = "";
+static thread_local bool s_is_shared_context = false;

 static std::string GetGLSLVersionString()
 {
@ -506,8 +507,8 @@ void ProgramShaderCache::Shutdown()
  s_last_VAO = 0;

  // All pipeline programs should have been released.
-  _dbg_assert_(VIDEO, pipelineprograms.empty());
-  pipelineprograms.clear();
+  _dbg_assert_(VIDEO, s_pipeline_programs.empty());
+  s_pipeline_programs.clear();
 }

 void ProgramShaderCache::CreateAttributelessVAO()
@ -548,21 +549,28 @@ void ProgramShaderCache::InvalidateLastProgram()
  CurrentProgram = 0;
 }

-const PipelineProgram* ProgramShaderCache::GetPipelineProgram(const OGLShader* vertex_shader,
+const PipelineProgram* ProgramShaderCache::GetPipelineProgram(const GLVertexFormat* vertex_format,
+                                                              const OGLShader* vertex_shader,
                                                              const OGLShader* geometry_shader,
                                                              const OGLShader* pixel_shader)
 {
  PipelineProgramKey key = {vertex_shader, geometry_shader, pixel_shader};
  {
-    std::lock_guard<std::mutex> guard(pipelineprogramlock);
-    auto iter = pipelineprograms.find(key);
-    if (iter != pipelineprograms.end())
+    std::lock_guard<std::mutex> guard(s_pipeline_program_lock);
+    auto iter = s_pipeline_programs.find(key);
+    if (iter != s_pipeline_programs.end())
    {
      iter->second->reference_count++;
      return iter->second.get();
    }
  }

+  // We temporarily change the vertex array to the pipeline's vertex format.
+  // This can prevent the NVIDIA OpenGL driver from recompiling on first use.
+  GLuint vao = vertex_format ? vertex_format->VAO : s_attributeless_VAO;
+  if (s_is_shared_context || vao != s_last_VAO)
+    glBindVertexArray(vao);
+
  std::unique_ptr<PipelineProgram> prog = std::make_unique<PipelineProgram>();
  prog->key = key;

@ -581,6 +589,11 @@ const PipelineProgram* ProgramShaderCache::GetPipelineProgram(const OGLShader* v
  // Link program.
  prog->shader.SetProgramBindings(false);
  glLinkProgram(prog->shader.glprogid);
+
+  // Restore VAO binding after linking.
+  if (!s_is_shared_context && vao != s_last_VAO)
+    glBindVertexArray(s_last_VAO);
+
  if (!ProgramShaderCache::CheckProgramLinkResult(prog->shader.glprogid, {}, {}, {}))
  {
    prog->shader.Destroy();
@ -588,9 +601,9 @@ const PipelineProgram* ProgramShaderCache::GetPipelineProgram(const OGLShader* v
  }

  // Lock to insert. A duplicate program may have been created in the meantime.
-  std::lock_guard<std::mutex> guard(pipelineprogramlock);
-  auto iter = pipelineprograms.find(key);
-  if (iter != pipelineprograms.end())
+  std::lock_guard<std::mutex> guard(s_pipeline_program_lock);
+  auto iter = s_pipeline_programs.find(key);
+  if (iter != s_pipeline_programs.end())
  {
    // Destroy this program, and use the one which was created first.
    prog->shader.Destroy();
@ -601,19 +614,25 @@ const PipelineProgram* ProgramShaderCache::GetPipelineProgram(const OGLShader* v
  // Set program variables on the shader which will be returned.
  // This is only needed for drivers which don't support binding layout.
  prog->shader.SetProgramVariables();
-  auto ip = pipelineprograms.emplace(key, std::move(prog));
+
+  // If this is a shared context, ensure we sync before we return the program to
+  // the main thread. If we don't do this, some drivers can lock up (e.g. AMD).
+  if (s_is_shared_context)
+    glFinish();
+
+  auto ip = s_pipeline_programs.emplace(key, std::move(prog));
  return ip.first->second.get();
 }

 void ProgramShaderCache::ReleasePipelineProgram(const PipelineProgram* prog)
 {
-  auto iter = pipelineprograms.find(prog->key);
-  _assert_(iter != pipelineprograms.end() && prog == iter->second.get());
+  auto iter = s_pipeline_programs.find(prog->key);
+  _assert_(iter != s_pipeline_programs.end() && prog == iter->second.get());

  if (--iter->second->reference_count == 0)
  {
    iter->second->shader.Destroy();
-    pipelineprograms.erase(iter);
+    s_pipeline_programs.erase(iter);
  }
 }

@ -783,4 +802,55 @@ void ProgramShaderCache::CreateHeader()
      v > GlslEs300 ? "precision highp sampler2DMS;" : "",
      v >= GlslEs310 ? "precision highp image2DArray;" : "");
 }
+
+bool SharedContextAsyncShaderCompiler::WorkerThreadInitMainThread(void** param)
+{
+  std::unique_ptr<cInterfaceBase> context = GLInterface->CreateSharedContext();
+  if (!context)
+  {
+    PanicAlert("Failed to create shared context for shader compiling.");
+    return false;
+  }
+
+  *param = context.release();
+  return true;
+}
+
+bool SharedContextAsyncShaderCompiler::WorkerThreadInitWorkerThread(void* param)
+{
+  cInterfaceBase* context = static_cast<cInterfaceBase*>(param);
+  if (!context->MakeCurrent())
+    return false;
+
+  s_is_shared_context = true;
+  if (g_ActiveConfig.backend_info.bSupportsPrimitiveRestart)
+  {
+    if (GLInterface->GetMode() == GLInterfaceMode::MODE_OPENGLES3)
+    {
+      glEnable(GL_PRIMITIVE_RESTART_FIXED_INDEX);
+    }
+    else
+    {
+      if (GLExtensions::Version() >= 310)
+      {
+        glEnable(GL_PRIMITIVE_RESTART);
+        glPrimitiveRestartIndex(65535);
+      }
+      else
+      {
+        glEnableClientState(GL_PRIMITIVE_RESTART_NV);
+        glPrimitiveRestartIndexNV(65535);
+      }
+    }
+  }
+
+  return true;
+}
+
+void SharedContextAsyncShaderCompiler::WorkerThreadExit(void* param)
+{
+  cInterfaceBase* context = static_cast<cInterfaceBase*>(param);
+  context->ClearCurrent();
+  delete context;
+}
 }  // namespace OGL
--- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.h
+++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.h
@ -11,6 +11,7 @@
 #include <unordered_map>

 #include "Common/GL/GLUtil.h"
+#include "VideoCommon/AsyncShaderCompiler.h"

 namespace OGL
 {
@ -87,7 +88,8 @@ public:
  static void Shutdown();
  static void CreateHeader();

-  static const PipelineProgram* GetPipelineProgram(const OGLShader* vertex_shader,
+  static const PipelineProgram* GetPipelineProgram(const GLVertexFormat* vertex_format,
+                                                   const OGLShader* vertex_shader,
                                                   const OGLShader* geometry_shader,
                                                   const OGLShader* pixel_shader);
  static void ReleasePipelineProgram(const PipelineProgram* prog);
@ -99,8 +101,8 @@ private:

  static void CreateAttributelessVAO();

-  static PipelineProgramMap pipelineprograms;
-  static std::mutex pipelineprogramlock;
+  static PipelineProgramMap s_pipeline_programs;
+  static std::mutex s_pipeline_program_lock;

  static u32 s_ubo_buffer_size;
  static s32 s_ubo_align;
@ -110,4 +112,12 @@ private:
  static GLuint s_last_VAO;
 };

+class SharedContextAsyncShaderCompiler : public VideoCommon::AsyncShaderCompiler
+{
+protected:
+  bool WorkerThreadInitMainThread(void** param) override;
+  bool WorkerThreadInitWorkerThread(void* param) override;
+  void WorkerThreadExit(void* param) override;
+};
+
 }  // namespace OGL
--- a/Source/Core/VideoBackends/OGL/Render.cpp
+++ b/Source/Core/VideoBackends/OGL/Render.cpp
@ -81,8 +81,8 @@ static bool s_efbCacheIsCleared = false;
 static std::vector<u32>
    s_efbCache[2][EFB_CACHE_WIDTH * EFB_CACHE_HEIGHT];  // 2 for PeekZ and PeekColor

-static void APIENTRY ErrorCallback(GLenum source, GLenum type, GLuint id, GLenum severity,
-                                   GLsizei length, const char* message, const void* userParam)
+void APIENTRY ErrorCallback(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length,
+                            const char* message, const void* userParam)
 {
  const char* s_source;
  const char* s_type;
@ -677,6 +677,10 @@ Renderer::Renderer()
      g_Config.backend_info.bSupportsPaletteConversion &&
      g_Config.backend_info.bSupportsComputeShaders && g_ogl_config.bSupportsImageLoadStore;

+  // Background compiling is supported only when shared contexts aren't broken.
+  g_Config.backend_info.bSupportsBackgroundCompiling =
+      !DriverDetails::HasBug(DriverDetails::BUG_SHARED_CONTEXT_SHADER_COMPILATION);
+
  if (g_ogl_config.bSupportsDebug)
  {
    if (GLExtensions::Supports("GL_KHR_debug"))
@ -1695,4 +1699,9 @@ void Renderer::DispatchComputeShader(const AbstractShader* shader, const void* u
  glDispatchCompute(groups_x, groups_y, groups_z);
  ProgramShaderCache::InvalidateLastProgram();
 }
+
+std::unique_ptr<VideoCommon::AsyncShaderCompiler> Renderer::CreateAsyncShaderCompiler()
+{
+  return std::make_unique<SharedContextAsyncShaderCompiler>();
+}
 }
--- a/Source/Core/VideoBackends/OGL/Render.h
+++ b/Source/Core/VideoBackends/OGL/Render.h
@ -139,6 +139,8 @@ public:
  void DispatchComputeShader(const AbstractShader* shader, const void* uniforms, u32 uniforms_size,
                             u32 groups_x, u32 groups_y, u32 groups_z) override;

+  std::unique_ptr<VideoCommon::AsyncShaderCompiler> CreateAsyncShaderCompiler() override;
+
 private:
  void UpdateEFBCache(EFBAccessType type, u32 cacheRectIdx, const EFBRectangle& efbPixelRc,
                      const TargetRectangle& targetPixelRc, const void* data);
--- a/Source/Core/VideoBackends/Software/SWmain.cpp
+++ b/Source/Core/VideoBackends/Software/SWmain.cpp
@ -73,6 +73,7 @@ void VideoSoftware::InitBackendInfo()
  g_Config.backend_info.bSupportsBPTCTextures = false;
  g_Config.backend_info.bSupportsCopyToVram = false;
  g_Config.backend_info.bSupportsFramebufferFetch = false;
+  g_Config.backend_info.bSupportsBackgroundCompiling = false;

  // aamodes
  g_Config.backend_info.AAModes = {1};
--- a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
+++ b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
@ -235,6 +235,7 @@ void VulkanContext::PopulateBackendInfo(VideoConfig* config)
  config->backend_info.bSupportsBitfield = true;              // Assumed support.
  config->backend_info.bSupportsDynamicSamplerIndexing = true;     // Assumed support.
  config->backend_info.bSupportsPostProcessing = true;             // Assumed support.
+  config->backend_info.bSupportsBackgroundCompiling = true;        // Assumed support.
  config->backend_info.bSupportsDualSourceBlend = false;           // Dependent on features.
  config->backend_info.bSupportsGeometryShaders = false;           // Dependent on features.
  config->backend_info.bSupportsGSInstancing = false;              // Dependent on features.
--- a/Source/Core/VideoCommon/DriverDetails.cpp
+++ b/Source/Core/VideoCommon/DriverDetails.cpp
@ -102,8 +102,6 @@ static BugInfo m_known_bugs[] = {
     true},
    {API_OPENGL, OS_LINUX, VENDOR_MESA, DRIVER_I965, Family::UNKNOWN,
     BUG_SHARED_CONTEXT_SHADER_COMPILATION, -1.0, -1.0, true},
-    {API_OPENGL, OS_LINUX, VENDOR_MESA, DRIVER_NOUVEAU, Family::UNKNOWN,
-     BUG_SHARED_CONTEXT_SHADER_COMPILATION, -1.0, -1.0, true},
    {API_VULKAN, OS_ALL, VENDOR_NVIDIA, DRIVER_NVIDIA, Family::UNKNOWN, BUG_BROKEN_MSAA_CLEAR, -1.0,
     -1.0, true},
    {API_VULKAN, OS_ALL, VENDOR_IMGTEC, DRIVER_IMGTEC, Family::UNKNOWN,
--- a/Source/Core/VideoCommon/DriverDetails.h
+++ b/Source/Core/VideoCommon/DriverDetails.h
@ -252,8 +252,10 @@ enum Bug
  // the negated value to a temporary variable then using that in the bitwise op.
  BUG_BROKEN_BITWISE_OP_NEGATION,

-  // Bug: Shaders are recompiled on the main thread after being previously compiled on
-  // a worker thread on Mesa i965.
+  // Bug: The GPU shader code appears to be context-specific on Mesa/i965.
+  // This means that if we compile the ubershaders asynchronously, they will be recompiled
+  // on the main thread the first time they are used, causing stutter. For now, disable
+  // asynchronous compilation on Mesa i965.
  // Started version: -1
  // Ended Version: -1
  BUG_SHARED_CONTEXT_SHADER_COMPILATION,
--- a/Source/Core/VideoCommon/RenderBase.cpp
+++ b/Source/Core/VideoCommon/RenderBase.cpp
@ -1027,3 +1027,8 @@ bool Renderer::UseVertexDepthRange() const
  // in the vertex shader.
  return fabs(xfmem.viewport.zRange) > 16777215.0f || fabs(xfmem.viewport.farZ) > 16777215.0f;
 }
+
+std::unique_ptr<VideoCommon::AsyncShaderCompiler> Renderer::CreateAsyncShaderCompiler()
+{
+  return std::make_unique<VideoCommon::AsyncShaderCompiler>();
+}
--- a/Source/Core/VideoCommon/RenderBase.h
+++ b/Source/Core/VideoCommon/RenderBase.h
@ -28,6 +28,7 @@
 #include "Common/Flag.h"
 #include "Common/MathUtil.h"
 #include "VideoCommon/AVIDump.h"
+#include "VideoCommon/AsyncShaderCompiler.h"
 #include "VideoCommon/BPMemory.h"
 #include "VideoCommon/FPSCounter.h"
 #include "VideoCommon/RenderState.h"
@ -189,6 +190,8 @@ public:
  void ResizeSurface(int new_width, int new_height);
  bool UseVertexDepthRange() const;

+  virtual std::unique_ptr<VideoCommon::AsyncShaderCompiler> CreateAsyncShaderCompiler();
+
  virtual void Shutdown();

  // Drawing utility shaders.
--- a/Source/Core/VideoCommon/ShaderCache.cpp
+++ b/Source/Core/VideoCommon/ShaderCache.cpp
@ -27,7 +27,7 @@ bool ShaderCache::Initialize()
  m_efb_multisamples = g_ActiveConfig.iMultisamples;

  // Create the async compiler, and start the worker threads.
-  m_async_shader_compiler = std::make_unique<VideoCommon::AsyncShaderCompiler>();
+  m_async_shader_compiler = g_renderer->CreateAsyncShaderCompiler();
  m_async_shader_compiler->ResizeWorkerThreads(g_ActiveConfig.GetShaderPrecompilerThreads());

  // Load shader and UID caches.
--- a/Source/Core/VideoCommon/VideoConfig.cpp
+++ b/Source/Core/VideoCommon/VideoConfig.cpp
@ -187,8 +187,7 @@ static u32 GetNumAutoShaderCompilerThreads()

 u32 VideoConfig::GetShaderCompilerThreads() const
 {
-  // videocommon shader cache is currently broken on OGL, needs multiple contexts.
-  if (backend_info.api_type == APIType::OpenGL)
+  if (!backend_info.bSupportsBackgroundCompiling)
    return 0;

  if (iShaderCompilerThreads >= 0)
@ -199,8 +198,7 @@ u32 VideoConfig::GetShaderCompilerThreads() const

 u32 VideoConfig::GetShaderPrecompilerThreads() const
 {
-  // videocommon shader cache is currently broken on OGL, needs multiple contexts.
-  if (backend_info.api_type == APIType::OpenGL)
+  if (!backend_info.bSupportsBackgroundCompiling)
    return 0;

  if (iShaderPrecompilerThreads >= 0)
--- a/Source/Core/VideoCommon/VideoConfig.h
+++ b/Source/Core/VideoCommon/VideoConfig.h
@ -227,6 +227,7 @@ struct VideoConfig final
    bool bSupportsDynamicSamplerIndexing;  // Needed by UberShaders, so must stay in VideoCommon
    bool bSupportsBPTCTextures;
    bool bSupportsFramebufferFetch;  // Used as an alternative to dual-source blend on GLES
+    bool bSupportsBackgroundCompiling;
  } backend_info;

  // Utility