Bug 1687157 - Support 24-bit depth in SWGL. r=jrmuizel

It is possible to support 24-bit depth in SWGL without a large performance hit
and without increasing the size of the depth buffer. Since depth runs already
have 32-bit entries, if we carefully limit the depth run size to 8 bits we have
24 bits left over to store the actual depth value.

Differential Revision: https://phabricator.services.mozilla.com/D107409
This commit is contained in:
Lee Salzman 2021-03-09 02:01:26 +00:00
parent f36c63a891
commit 7390f42dc3
7 changed files with 4038 additions and 49 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -27,3 +27,4 @@ fuzzy-if(webrender,2-7,17500-36908) == 1523776.html 1523776-ref.html
skip-if(!asyncPan||!webrender||Android) fuzzy-if(winWidget,94-94,3415-3419) fuzzy-if(cocoaWidget&&swgl,1-1,1-1) pref(apz.allow_zooming,true) == picture-caching-on-async-zoom.html picture-caching-on-async-zoom.html?ref
pref(apz.allow_zooming,true) == 1662062-1-no-blurry.html 1662062-1-ref.html
== 1681610.html 1681610-ref.html
skip-if(!webrender) fuzzy-if(webrender,0-255,0-60) == 1687157-1.html 1687157-1-ref.html

View File

@ -400,8 +400,8 @@ struct Texture {
uint32_t clear_val = 0;
uint32_t* cleared_rows = nullptr;
void init_depth_runs(uint16_t z);
void fill_depth_runs(uint16_t z, const IntRect& scissor);
void init_depth_runs(uint32_t z);
void fill_depth_runs(uint32_t z, const IntRect& scissor);
void enable_delayed_clear(uint32_t val) {
delay_clear = height;
@ -472,7 +472,7 @@ struct Texture {
// just to be safe. All other texture types and use-cases should be
// safe to omit padding.
size_t padding =
internal_format == GL_DEPTH_COMPONENT16 || max(width, min_width) < 2
internal_format == GL_DEPTH_COMPONENT24 || max(width, min_width) < 2
? sizeof(Float)
: 0;
char* new_buf = (char*)realloc(buf, size + padding);
@ -1562,7 +1562,7 @@ void PixelStorei(GLenum name, GLint param) {
static GLenum remap_internal_format(GLenum format) {
switch (format) {
case GL_DEPTH_COMPONENT:
return GL_DEPTH_COMPONENT16;
return GL_DEPTH_COMPONENT24;
case GL_RGBA:
return GL_RGBA8;
case GL_RED:
@ -1854,10 +1854,11 @@ void RenderbufferStorage(GLenum target, GLenum internal_format, GLsizei width,
}
switch (internal_format) {
case GL_DEPTH_COMPONENT:
case GL_DEPTH_COMPONENT16:
case GL_DEPTH_COMPONENT24:
case GL_DEPTH_COMPONENT32:
// Force depth format to 16 bits...
internal_format = GL_DEPTH_COMPONENT16;
// Force depth format to 24 bits...
internal_format = GL_DEPTH_COMPONENT24;
break;
}
set_tex_storage(ctx->textures[r.texture], internal_format, width, height);
@ -2240,7 +2241,7 @@ void InitDefaultFramebuffer(int x, int y, int width, int height, int stride,
}
// Ensure dimensions of the depth buffer match the color buffer.
Texture& depthtex = ctx->textures[fb.depth_attachment];
set_tex_storage(depthtex, GL_DEPTH_COMPONENT16, width, height);
set_tex_storage(depthtex, GL_DEPTH_COMPONENT24, width, height);
depthtex.offset = IntPoint(x, y);
}
@ -2292,19 +2293,16 @@ void ClearTexSubImage(GLuint texture, GLint level, GLint xoffset, GLint yoffset,
}
assert(zoffset == 0 && depth == 1);
IntRect scissor = {xoffset, yoffset, xoffset + width, yoffset + height};
if (t.internal_format == GL_DEPTH_COMPONENT16) {
uint16_t value = 0xFFFF;
if (t.internal_format == GL_DEPTH_COMPONENT24) {
uint32_t value = 0xFFFFFF;
switch (format) {
case GL_DEPTH_COMPONENT:
switch (type) {
case GL_DOUBLE:
value = uint16_t(*(const GLdouble*)data * 0xFFFF);
value = uint32_t(*(const GLdouble*)data * 0xFFFFFF);
break;
case GL_FLOAT:
value = uint16_t(*(const GLfloat*)data * 0xFFFF);
break;
case GL_UNSIGNED_SHORT:
value = uint16_t(*(const GLushort*)data);
value = uint32_t(*(const GLfloat*)data * 0xFFFFFF);
break;
default:
assert(false);
@ -2629,7 +2627,7 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
colortex.internal_format == GL_R8);
Texture& depthtex = ctx->textures[ctx->depthtest ? fb.depth_attachment : 0];
if (depthtex.buf) {
assert(depthtex.internal_format == GL_DEPTH_COMPONENT16);
assert(depthtex.internal_format == GL_DEPTH_COMPONENT24);
assert(colortex.width == depthtex.width &&
colortex.height == depthtex.height);
assert(colortex.offset == depthtex.offset);

View File

@ -24,35 +24,42 @@
// the DepthRun struct can be interpreted as a sign-extended int32_t depth. It
// is then possible to just treat the entire row as an array of int32_t depth
// samples that can be processed with SIMD comparisons, since the count field
// behaves as just the sign-extension of the depth field.
// When a depth buffer is cleared, each row is initialized to a single run
// behaves as just the sign-extension of the depth field. The count field is
// limited to 8 bits so that we can support depth values up to 24 bits.
// When a depth buffer is cleared, each row is initialized to maximal runs
// spanning the entire row. In the normal case, the depth buffer will continue
// to manage itself as a list of runs. If perspective or discard is used for
// a given row, the row will be converted to the flattened representation to
// support it, after which it will only ever revert back to runs if the depth
// buffer is cleared.
// The largest 24-bit depth value supported.
constexpr uint32_t MAX_DEPTH_VALUE = 0xFFFFFF;
// The longest 8-bit depth run that is supported, aligned to SIMD chunk size.
constexpr uint32_t MAX_DEPTH_RUN = 255 & ~3;
struct DepthRun {
// Ensure that depth always occupies the LSB and count the MSB so that we
// can sign-extend depth just by setting count to zero, marking it flat.
// When count is non-zero, then this is interpreted as an actual run and
// depth is read in isolation.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
uint16_t depth;
uint16_t count;
uint32_t depth : 24;
uint32_t count : 8;
#else
uint16_t count;
uint16_t depth;
uint32_t count : 8;
uint32_t depth : 24;
#endif
DepthRun() = default;
DepthRun(uint16_t depth, uint16_t count) : depth(depth), count(count) {}
DepthRun(uint32_t depth, uint8_t count) : depth(depth), count(count) {}
// If count is zero, this is actually a flat depth sample rather than a run.
bool is_flat() const { return !count; }
// Compare a source depth from rasterization with a stored depth value.
template <int FUNC>
ALWAYS_INLINE bool compare(uint16_t src) const {
ALWAYS_INLINE bool compare(uint32_t src) const {
switch (FUNC) {
case GL_LEQUAL:
return src <= depth;
@ -67,6 +74,22 @@ struct DepthRun {
}
};
// Fills runs at the given position with the given depth up to the span width.
// Fills runs at the given position with the given depth up to the span width.
static ALWAYS_INLINE void set_depth_runs(DepthRun* runs, uint32_t depth,
                                         uint32_t width) {
  // A single run's 8-bit count field cannot exceed MAX_DEPTH_RUN, so emit
  // as many maximal-length runs as will fit in the span first.
  while (width >= MAX_DEPTH_RUN) {
    *runs = DepthRun(depth, MAX_DEPTH_RUN);
    runs += MAX_DEPTH_RUN;
    width -= MAX_DEPTH_RUN;
  }
  // Any remaining samples shorter than the maximum run size get one final,
  // shorter run.
  if (width > 0) {
    *runs = DepthRun(depth, width);
  }
}
// A cursor for reading and modifying a row's depth run array. It locates
// and iterates through a desired span within all the runs, testing if
// the depth of this span passes or fails the depth test against existing
@ -128,7 +151,7 @@ struct DepthCursor {
// so it is safe for the caller to stop processing any more regions in this
// row.
template <int FUNC>
int skip_failed(uint16_t val) {
int skip_failed(uint32_t val) {
assert(valid());
DepthRun* prev = start;
while (cur < end) {
@ -143,7 +166,7 @@ struct DepthCursor {
// Helper to convert function parameters into template parameters to hoist
// some checks out of inner loops.
ALWAYS_INLINE int skip_failed(uint16_t val, GLenum func) {
ALWAYS_INLINE int skip_failed(uint32_t val, GLenum func) {
switch (func) {
case GL_LEQUAL:
return skip_failed<GL_LEQUAL>(val);
@ -162,7 +185,7 @@ struct DepthCursor {
// to represent this new region that passed the depth test. The length of the
// region is returned.
template <int FUNC, bool MASK>
int check_passed(uint16_t val) {
int check_passed(uint32_t val) {
assert(valid());
DepthRun* prev = cur;
while (cur < end) {
@ -201,7 +224,7 @@ struct DepthCursor {
prev->count = start - prev;
}
// Create a new run for the entirety of the passed samples.
*start = DepthRun(val, passed);
set_depth_runs(start, val, passed);
}
start = cur;
return passed;
@ -210,7 +233,7 @@ struct DepthCursor {
// Helper to convert function parameters into template parameters to hoist
// some checks out of inner loops.
template <bool MASK>
ALWAYS_INLINE int check_passed(uint16_t val, GLenum func) {
ALWAYS_INLINE int check_passed(uint32_t val, GLenum func) {
switch (func) {
case GL_LEQUAL:
return check_passed<GL_LEQUAL, MASK>(val);
@ -222,37 +245,37 @@ struct DepthCursor {
}
}
ALWAYS_INLINE int check_passed(uint16_t val, GLenum func, bool mask) {
// Convert the runtime depth-mask flag into the MASK template parameter so
// the mask check is hoisted out of the inner depth-test loops. Returns the
// length of the span that passed the depth test.
ALWAYS_INLINE int check_passed(uint32_t val, GLenum func, bool mask) {
  return mask ? check_passed<true>(val, func)
              : check_passed<false>(val, func);
}
// Fill a region of runs with a given depth value, bypassing any depth test.
ALWAYS_INLINE void fill(uint16_t depth) {
ALWAYS_INLINE void fill(uint32_t depth) {
  // GL_ALWAYS makes every sample pass the depth test; MASK=true is used so
  // that the stored depth values are actually overwritten (presumably the
  // MASK parameter gates depth writes — confirm against check_passed).
  check_passed<GL_ALWAYS, true>(depth);
}
};
// Initialize a depth texture by setting the first run in each row to encompass
// the entire row.
void Texture::init_depth_runs(uint16_t depth) {
void Texture::init_depth_runs(uint32_t depth) {
if (!buf) return;
DepthRun* runs = (DepthRun*)buf;
for (int y = 0; y < height; y++) {
runs[0] = DepthRun(depth, width);
set_depth_runs(runs, depth, width);
runs += stride() / sizeof(DepthRun);
}
set_cleared(true);
}
// Fill a portion of the run array with flattened depth samples.
static ALWAYS_INLINE void fill_depth_run(DepthRun* dst, size_t n,
uint16_t depth) {
fill_n((uint32_t*)dst, n, uint32_t(depth));
static ALWAYS_INLINE void fill_flat_depth(DepthRun* dst, size_t n,
                                          uint32_t depth) {
  // Flattened rows store one 32-bit depth sample per entry, so the runs can
  // be reinterpreted as a raw uint32_t array and filled directly.
  fill_n((uint32_t*)dst, n, depth);
}
// Fills a scissored region of a depth texture with a given depth.
void Texture::fill_depth_runs(uint16_t depth, const IntRect& scissor) {
void Texture::fill_depth_runs(uint32_t depth, const IntRect& scissor) {
if (!buf) return;
assert(cleared());
IntRect bb = bounds().intersection(scissor - offset);
@ -261,10 +284,10 @@ void Texture::fill_depth_runs(uint16_t depth, const IntRect& scissor) {
if (bb.width() >= width) {
// If the scissor region encompasses the entire row, reset the row to a
// single run encompassing the entire row.
runs[0] = DepthRun(depth, width);
set_depth_runs(runs, depth, width);
} else if (runs->is_flat()) {
// If the row is flattened, just directly fill the portion of the row.
fill_depth_run(&runs[bb.x0], bb.width(), depth);
fill_flat_depth(&runs[bb.x0], bb.width(), depth);
} else {
// Otherwise, if we are still using runs, then set up a cursor to fill
// it with depth runs.
@ -320,7 +343,7 @@ static ALWAYS_INLINE bool check_depth(I32 src, DepthRun* zbuf, ZMask& outmask,
}
static ALWAYS_INLINE I32 packDepth() {
return cast(fragment_shader->gl_FragCoord.z * 0xFFFF);
return cast(fragment_shader->gl_FragCoord.z * MAX_DEPTH_VALUE);
}
static ALWAYS_INLINE void discard_depth(I32 src, DepthRun* zbuf, I32 mask) {
@ -547,7 +570,7 @@ static void flatten_depth_runs(DepthRun* runs, size_t width) {
}
while (width > 0) {
size_t n = runs->count;
fill_depth_run(runs, n, runs->depth);
fill_flat_depth(runs, n, runs->depth);
runs += n;
width -= n;
}
@ -556,7 +579,7 @@ static void flatten_depth_runs(DepthRun* runs, size_t width) {
// Helper function for drawing passed depth runs within the depth buffer.
// Flattened depth (perspective or discard) is not supported.
template <typename P>
static ALWAYS_INLINE void draw_depth_span(uint16_t z, P* buf,
static ALWAYS_INLINE void draw_depth_span(uint32_t z, P* buf,
DepthCursor& cursor) {
for (;;) {
// Get the span that passes the depth test. Assume on entry that
@ -614,7 +637,7 @@ template <bool DISCARD, bool W, typename P, typename Z>
static ALWAYS_INLINE void draw_span(P* buf, DepthRun* depth, int span, Z z) {
if (depth) {
// Depth testing is enabled. If perspective is used, Z values will vary
// across the span, we use packDepth to generate 16-bit Z values suitable
// across the span, we use packDepth to generate packed Z values suitable
// for depth testing based on current values from gl_FragCoord.z.
// Otherwise, for the no-perspective case, we just use the provided Z.
// Process 4-pixel chunks first.
@ -662,7 +685,7 @@ static ALWAYS_INLINE void draw_span(P* buf, DepthRun* depth, int span, Z z) {
template <typename P>
static inline void prepare_row(Texture& colortex, int y, int startx, int endx,
bool use_discard, DepthRun* depth,
uint16_t z = 0, DepthCursor* cursor = nullptr) {
uint32_t z = 0, DepthCursor* cursor = nullptr) {
assert(colortex.delay_clear > 0);
// Delayed clear is enabled for the color buffer. Check if needs clear.
uint32_t& mask = colortex.cleared_rows[y / 32];
@ -735,7 +758,7 @@ static ALWAYS_INLINE bool checkIfEdgesFlipped(T l0, T l1, T r0, T r1) {
// assumed to be ordered in either CW or CCW to support this, but currently
// both orders (CW and CCW) are supported and equivalent.
template <typename P>
static inline void draw_quad_spans(int nump, Point2D p[4], uint16_t z,
static inline void draw_quad_spans(int nump, Point2D p[4], uint32_t z,
Interpolants interp_outs[4],
Texture& colortex, Texture& depthtex,
const ClipRect& clipRect) {
@ -1534,7 +1557,7 @@ static void draw_quad(int nump, Texture& colortex, Texture& depthtex) {
}
// Since Z doesn't need to be interpolated, just set the fragment shader's
// Z and W values here, once and for all fragment shader invocations.
uint16_t z = uint16_t(0xFFFF * screenZ);
uint32_t z = uint32_t(MAX_DEPTH_VALUE * screenZ);
fragment_shader->gl_FragCoord.z = screenZ;
fragment_shader->gl_FragCoord.w = w;

View File

@ -1517,7 +1517,7 @@ impl Compositor for SwCompositor {
// tile size is not bigger than what was previously allocated.
self.gl.set_texture_buffer(
self.depth_id,
gl::DEPTH_COMPONENT16,
gl::DEPTH_COMPONENT,
valid_rect.size.width,
valid_rect.size.height,
0,

View File

@ -1525,11 +1525,14 @@ impl Device {
};
let is_software_webrender = renderer_name.starts_with("Software WebRender");
let (depth_format, upload_method) = if is_software_webrender {
(gl::DEPTH_COMPONENT16, UploadMethod::Immediate)
let upload_method = if is_software_webrender {
// Uploads in SWGL generally reduce to simple memory copies.
UploadMethod::Immediate
} else {
(gl::DEPTH_COMPONENT24, upload_method)
upload_method
};
// Prefer 24-bit depth format. While 16-bit depth also works, it may exhaust depth ids easily.
let depth_format = gl::DEPTH_COMPONENT24;
info!("GL texture cache {:?}, bgra {:?} swizzle {:?}, texture storage {:?}, depth {:?}",
color_formats, bgra_formats, bgra8_sampling_swizzle, texture_storage_usage, depth_format);