Bug 1870395 - gfxAlphaRecovery - Neon version r=jrmuizel,gfx-reviewers

Depends on D196459 Differential Revision: https://phabricator.services.mozilla.com/D196860
2025-02-25 20:01:50 +00:00 · 2024-01-04 10:11:51 +00:00 · 2024-01-04 10:11:51 +00:00 · fd7b78a368
commit fd7b78a368
parent 0e607b2b3f
5 changed files with 157 additions and 134 deletions
--- a/gfx/thebes/gfxAlphaRecovery.cpp
+++ b/gfx/thebes/gfxAlphaRecovery.cpp
@ -10,6 +10,8 @@
 #define MOZILLA_SSE_INCLUDE_HEADER_FOR_SSE2
 #include "mozilla/SSE.h"

+#include <xsimd/xsimd.hpp>
+
 /* static */
 bool gfxAlphaRecovery::RecoverAlpha(gfxImageSurface* blackSurf,
                                    const gfxImageSurface* whiteSurf) {
@ -23,7 +25,14 @@ bool gfxAlphaRecovery::RecoverAlpha(gfxImageSurface* blackSurf,
    return false;

 #ifdef MOZILLA_MAY_SUPPORT_SSE2
-  if (mozilla::supports_sse2() && RecoverAlphaSSE2(blackSurf, whiteSurf)) {
+  if (mozilla::supports_sse2() &&
+      RecoverAlphaGeneric<xsimd::sse2>(blackSurf, whiteSurf)) {
+    return true;
+  }
+#endif
+#ifdef MOZILLA_MAY_SUPPORT_NEON
+  if (mozilla::supports_neon() &&
+      RecoverAlphaGeneric<xsimd::neon>(blackSurf, whiteSurf)) {
    return true;
  }
 #endif
--- a/gfx/thebes/gfxAlphaRecovery.h
+++ b/gfx/thebes/gfxAlphaRecovery.h
@ -6,7 +6,6 @@
 #ifndef _GFXALPHARECOVERY_H_
 #define _GFXALPHARECOVERY_H_

-#include "mozilla/SSE.h"
 #include "gfxTypes.h"
 #include "mozilla/gfx/Rect.h"

@ -33,14 +32,12 @@ class gfxAlphaRecovery {
  static bool RecoverAlpha(gfxImageSurface* blackSurface,
                           const gfxImageSurface* whiteSurface);

-#ifdef MOZILLA_MAY_SUPPORT_SSE2
-  /* This does the same as the previous function, but uses SSE2
-   * optimizations. Usually this should not be called directly.  Be sure to
-   * check mozilla::supports_sse2() before calling this function.
+  /* This does the same as the previous function, but uses SIMD
+   * optimizations. Usually this should not be called directly.
   */
-  static bool RecoverAlphaSSE2(gfxImageSurface* blackSurface,
-                               const gfxImageSurface* whiteSurface);
-#endif
+  template <class Arch>
+  static bool RecoverAlphaGeneric(gfxImageSurface* blackSurface,
+                                  const gfxImageSurface* whiteSurface);

  /** from cairo-xlib-utils.c, modified */
  /**
@ -62,7 +59,7 @@ class gfxAlphaRecovery {
   * bits are likely to be the most accurate.
   *
   * This function needs to be in the header file since it's used by both
-   * gfxRecoverAlpha.cpp and gfxRecoverAlphaSSE2.cpp.
+   * gfxAlphaRecovery.cpp and gfxAlphaRecoveryGeneric.h.
   */

  static inline uint32_t RecoverPixel(uint32_t black, uint32_t white) {
--- a/gfx/thebes/gfxAlphaRecoveryGeneric.h
+++ b/gfx/thebes/gfxAlphaRecoveryGeneric.h
@ -0,0 +1,129 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef _GFXALPHARECOVERY_GENERIC_H_
+#define _GFXALPHARECOVERY_GENERIC_H_
+
+#include "gfxAlphaRecovery.h"
+#include "gfxImageSurface.h"
+#include "nsDebug.h"
+#include <xsimd/xsimd.hpp>
+
+template <typename Arch>
+bool gfxAlphaRecovery::RecoverAlphaGeneric(gfxImageSurface* blackSurf,
+                                           const gfxImageSurface* whiteSurf) {
+  mozilla::gfx::IntSize size = blackSurf->GetSize();
+
+  if (size != whiteSurf->GetSize() ||
+      (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
+       blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) ||
+      (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
+       whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32))
+    return false;
+
+  blackSurf->Flush();
+  whiteSurf->Flush();
+
+  unsigned char* blackData = blackSurf->Data();
+  unsigned char* whiteData = whiteSurf->Data();
+
+  if ((NS_PTR_TO_UINT32(blackData) & 0xf) !=
+          (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
+      (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
+    // Both surfaces must share the same offset within a 16 byte block.
+    return false;
+  }
+
+  alignas(Arch::alignment()) static const uint8_t greenMaski[] = {
+      0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
+      0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
+  };
+  alignas(Arch::alignment()) static const uint8_t alphaMaski[] = {
+      0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
+      0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
+  };
+
+  using batch_type = xsimd::batch<uint8_t, Arch>;
+  constexpr size_t batch_size = batch_type::size;
+  static_assert(batch_size == 16);
+
+  batch_type greenMask = batch_type::load_aligned(greenMaski);
+  batch_type alphaMask = batch_type::load_aligned(alphaMaski);
+
+  for (int32_t i = 0; i < size.height; ++i) {
+    int32_t j = 0;
+    // Loop single pixels until at 16 byte alignment.
+    while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
+      *((uint32_t*)blackData) =
+          RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
+                       *reinterpret_cast<uint32_t*>(whiteData));
+      blackData += 4;
+      whiteData += 4;
+      j++;
+    }
+    // This extra loop allows the compiler to do some more clever register
+    // management and makes it about 5% faster than with only the 4 pixel
+    // at a time loop.
+    for (; j < size.width - 8; j += 8) {
+      auto black1 = batch_type::load_aligned(blackData);
+      auto white1 = batch_type::load_aligned(whiteData);
+      auto black2 = batch_type::load_aligned(blackData + batch_size);
+      auto white2 = batch_type::load_aligned(whiteData + batch_size);
+
+      // Execute the same instructions as described in RecoverPixel, only
+      // using a packed saturated subtract.
+      white1 = xsimd::ssub(white1, black1);
+      white2 = xsimd::ssub(white2, black2);
+      white1 = xsimd::ssub(greenMask, white1);
+      white2 = xsimd::ssub(greenMask, white2);
+      // Producing the final black pixel in a register and storing
+      // that is actually faster than doing a masked store since that
+      // does an unaligned storage. We have the black pixel in a register
+      // anyway.
+      black1 = xsimd::bitwise_andnot(black1, alphaMask);
+      black2 = xsimd::bitwise_andnot(black2, alphaMask);
+      white1 = xsimd::slide_left<2>(white1);
+      white2 = xsimd::slide_left<2>(white2);
+      white1 &= alphaMask;
+      white2 &= alphaMask;
+      black1 |= white1;
+      black2 |= white2;
+
+      black1.store_aligned(blackData);
+      black2.store_aligned(blackData + batch_size);
+      blackData += 2 * batch_size;
+      whiteData += 2 * batch_size;
+    }
+    for (; j < size.width - 4; j += 4) {
+      auto black = batch_type::load_aligned(blackData);
+      auto white = batch_type::load_aligned(whiteData);
+
+      white = xsimd::ssub(white, black);
+      white = xsimd::ssub(greenMask, white);
+      black = xsimd::bitwise_andnot(black, alphaMask);
+      white = xsimd::slide_left<2>(white);
+      white &= alphaMask;
+      black |= white;
+      black.store_aligned(blackData);
+      blackData += batch_size;
+      whiteData += batch_size;
+    }
+    // Loop single pixels until we're done.
+    while (j < size.width) {
+      *((uint32_t*)blackData) =
+          RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
+                       *reinterpret_cast<uint32_t*>(whiteData));
+      blackData += 4;
+      whiteData += 4;
+      j++;
+    }
+    blackData += blackSurf->Stride() - j * 4;
+    whiteData += whiteSurf->Stride() - j * 4;
+  }
+
+  blackSurf->MarkDirty();
+
+  return true;
+}
+#endif
--- a/gfx/thebes/gfxAlphaRecoveryNeon.cpp
+++ b/gfx/thebes/gfxAlphaRecoveryNeon.cpp
@ -0,0 +1,9 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "gfxAlphaRecoveryGeneric.h"
+
+template bool gfxAlphaRecovery::RecoverAlphaGeneric<xsimd::neon>(
+    gfxImageSurface* blackSurf, const gfxImageSurface* whiteSurf);
--- a/gfx/thebes/gfxAlphaRecoverySSE2.cpp
+++ b/gfx/thebes/gfxAlphaRecoverySSE2.cpp
@ -3,128 +3,7 @@
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

-#include "gfxAlphaRecovery.h"
-#include "gfxImageSurface.h"
-#include "nsDebug.h"
-#include <xsimd/xsimd.hpp>
+#include "gfxAlphaRecoveryGeneric.h"

-using arch = xsimd::sse2;
-
-// This file should only be compiled on x86 and x64 systems.  Additionally,
-// you'll need to compile it with -msse2 if you're using GCC on x86.
-
-alignas(arch::alignment()) static const uint8_t greenMaski[] = {
-    0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
-    0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
-};
-alignas(arch::alignment()) static const uint8_t alphaMaski[] = {
-    0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
-    0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
-};
-
-bool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
-                                        const gfxImageSurface* whiteSurf) {
-  mozilla::gfx::IntSize size = blackSurf->GetSize();
-
-  if (size != whiteSurf->GetSize() ||
-      (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
-       blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) ||
-      (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
-       whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32))
-    return false;
-
-  blackSurf->Flush();
-  whiteSurf->Flush();
-
-  unsigned char* blackData = blackSurf->Data();
-  unsigned char* whiteData = whiteSurf->Data();
-
-  if ((NS_PTR_TO_UINT32(blackData) & 0xf) !=
-          (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
-      (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
-    // Cannot keep these in alignment.
-    return false;
-  }
-
-  using batch_type = xsimd::batch<uint8_t, arch>;
-  constexpr size_t batch_size = batch_type::size;
-  static_assert(batch_size == 16);
-
-  batch_type greenMask = batch_type::load_aligned(greenMaski);
-  batch_type alphaMask = batch_type::load_aligned(alphaMaski);
-
-  for (int32_t i = 0; i < size.height; ++i) {
-    int32_t j = 0;
-    // Loop single pixels until at 4 byte alignment.
-    while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
-      *((uint32_t*)blackData) =
-          RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
-                       *reinterpret_cast<uint32_t*>(whiteData));
-      blackData += 4;
-      whiteData += 4;
-      j++;
-    }
-    // This extra loop allows the compiler to do some more clever registry
-    // management and makes it about 5% faster than with only the 4 pixel
-    // at a time loop.
-    for (; j < size.width - 8; j += 8) {
-      auto black1 = batch_type::load_aligned(blackData);
-      auto white1 = batch_type::load_aligned(whiteData);
-      auto black2 = batch_type::load_aligned(blackData + batch_size);
-      auto white2 = batch_type::load_aligned(whiteData + batch_size);
-
-      // Execute the same instructions as described in RecoverPixel, only
-      // using a packed saturated subtract.
-      white1 = xsimd::ssub(white1, black1);
-      white2 = xsimd::ssub(white2, black2);
-      white1 = xsimd::ssub(greenMask, white1);
-      white2 = xsimd::ssub(greenMask, white2);
-      // Producing the final black pixel in a register and storing
-      // that is actually faster than doing a masked store since that
-      // does an unaligned storage. We have the black pixel in a register
-      // anyway.
-      black1 = xsimd::bitwise_andnot(black1, alphaMask);
-      black2 = xsimd::bitwise_andnot(black2, alphaMask);
-      white1 = xsimd::slide_left<2>(white1);
-      white2 = xsimd::slide_left<2>(white2);
-      white1 &= alphaMask;
-      white2 &= alphaMask;
-      black1 |= white1;
-      black2 |= white2;
-
-      black1.store_aligned(blackData);
-      black2.store_aligned(blackData + batch_size);
-      blackData += 2 * batch_size;
-      whiteData += 2 * batch_size;
-    }
-    for (; j < size.width - 4; j += 4) {
-      auto black = batch_type::load_aligned(blackData);
-      auto white = batch_type::load_aligned(whiteData);
-
-      white = xsimd::ssub(white, black);
-      white = xsimd::ssub(greenMask, white);
-      black = xsimd::bitwise_andnot(black, alphaMask);
-      white = xsimd::slide_left<2>(white);
-      white &= alphaMask;
-      black |= white;
-      black.store_aligned(blackData);
-      blackData += batch_size;
-      whiteData += batch_size;
-    }
-    // Loop single pixels until we're done.
-    while (j < size.width) {
-      *((uint32_t*)blackData) =
-          RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
-                       *reinterpret_cast<uint32_t*>(whiteData));
-      blackData += 4;
-      whiteData += 4;
-      j++;
-    }
-    blackData += blackSurf->Stride() - j * 4;
-    whiteData += whiteSurf->Stride() - j * 4;
-  }
-
-  blackSurf->MarkDirty();
-
-  return true;
-}
+template bool gfxAlphaRecovery::RecoverAlphaGeneric<xsimd::sse2>(
+    gfxImageSurface* blackSurf, const gfxImageSurface* whiteSurf);