Bug 1870395 - gfxAlphaRecovery - Neon version r=jrmuizel,gfx-reviewers

Depends on D196459

Differential Revision: https://phabricator.services.mozilla.com/D196860
This commit is contained in:
serge-sans-paille 2024-01-04 10:11:51 +00:00
parent 0e607b2b3f
commit fd7b78a368
5 changed files with 157 additions and 134 deletions

View File

@ -10,6 +10,8 @@
#define MOZILLA_SSE_INCLUDE_HEADER_FOR_SSE2
#include "mozilla/SSE.h"
#include <xsimd/xsimd.hpp>
/* static */
bool gfxAlphaRecovery::RecoverAlpha(gfxImageSurface* blackSurf,
const gfxImageSurface* whiteSurf) {
@ -23,7 +25,14 @@ bool gfxAlphaRecovery::RecoverAlpha(gfxImageSurface* blackSurf,
return false;
#ifdef MOZILLA_MAY_SUPPORT_SSE2
if (mozilla::supports_sse2() && RecoverAlphaSSE2(blackSurf, whiteSurf)) {
if (mozilla::supports_sse2() &&
RecoverAlphaGeneric<xsimd::sse2>(blackSurf, whiteSurf)) {
return true;
}
#endif
#ifdef MOZILLA_MAY_SUPPORT_NEON
if (mozilla::supports_neon() &&
RecoverAlphaGeneric<xsimd::neon>(blackSurf, whiteSurf)) {
return true;
}
#endif

View File

@ -6,7 +6,6 @@
#ifndef _GFXALPHARECOVERY_H_
#define _GFXALPHARECOVERY_H_
#include "mozilla/SSE.h"
#include "gfxTypes.h"
#include "mozilla/gfx/Rect.h"
@ -33,14 +32,12 @@ class gfxAlphaRecovery {
static bool RecoverAlpha(gfxImageSurface* blackSurface,
const gfxImageSurface* whiteSurface);
#ifdef MOZILLA_MAY_SUPPORT_SSE2
/* This does the same as the previous function, but uses SSE2
* optimizations. Usually this should not be called directly. Be sure to
* check mozilla::supports_sse2() before calling this function.
/* This does the same as the previous function, but uses SIMD
* optimizations. Usually this should not be called directly.
*/
static bool RecoverAlphaSSE2(gfxImageSurface* blackSurface,
const gfxImageSurface* whiteSurface);
#endif
template <class Arch>
static bool RecoverAlphaGeneric(gfxImageSurface* blackSurface,
const gfxImageSurface* whiteSurface);
/** from cairo-xlib-utils.c, modified */
/**
@ -62,7 +59,7 @@ class gfxAlphaRecovery {
* bits are likely to be the most accurate.
*
* This function needs to be in the header file since it's used by both
* gfxRecoverAlpha.cpp and gfxRecoverAlphaSSE2.cpp.
* gfxRecoverAlpha.cpp and gfxAlphaRecoveryGeneric.h.
*/
static inline uint32_t RecoverPixel(uint32_t black, uint32_t white) {

View File

@ -0,0 +1,129 @@
/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef _GFXALPHARECOVERY_GENERIC_H_
#define _GFXALPHARECOVERY_GENERIC_H_
#include "gfxAlphaRecovery.h"
#include "gfxImageSurface.h"
#include "nsDebug.h"
#include <xsimd/xsimd.hpp>
/**
 * SIMD implementation of alpha recovery, written once against xsimd and
 * instantiated per architecture. It requires the uint8_t batch of |Arch| to
 * be exactly 16 bytes wide (four 32-bit pixels); this is enforced by the
 * static_assert below.
 *
 * For every pixel, the alpha byte of |blackSurf| is recomputed from the
 * difference between the |whiteSurf| pixel and the |blackSurf| pixel, using
 * the same arithmetic as RecoverPixel(). The remaining three bytes of the
 * black pixel are preserved. The result is written back into |blackSurf|.
 *
 * @tparam Arch      xsimd architecture tag (e.g. xsimd::sse2, xsimd::neon).
 *                   RecoverAlpha() only dispatches here after the matching
 *                   mozilla::supports_*() runtime check.
 * @param blackSurf  Surface rendered on black; modified in place and marked
 *                   dirty on success.
 * @param whiteSurf  Surface rendered on white; only read.
 * @return false, before any pixel is modified, when the two surfaces differ
 *         in size, when either format is not A8R8G8B8_UINT32 or
 *         X8R8G8B8_UINT32, or when the two buffers cannot be kept at the same
 *         16-byte alignment; true once every row has been processed.
 */
template <typename Arch>
bool gfxAlphaRecovery::RecoverAlphaGeneric(gfxImageSurface* blackSurf,
                                           const gfxImageSurface* whiteSurf) {
  mozilla::gfx::IntSize size = blackSurf->GetSize();

  if (size != whiteSurf->GetSize() ||
      (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
       blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) ||
      (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
       whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32)) {
    return false;
  }

  blackSurf->Flush();
  whiteSurf->Flush();

  unsigned char* blackData = blackSurf->Data();
  unsigned char* whiteData = whiteSurf->Data();

  // The vector loops use aligned loads on both buffers with a single shared
  // alignment prologue, so both pointers must have the same offset within a
  // 16-byte block, and the strides must differ by a multiple of 16 so that
  // this stays true on every row.
  if ((NS_PTR_TO_UINT32(blackData) & 0xf) !=
          (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
      ((blackSurf->Stride() - whiteSurf->Stride()) & 0xf)) {
    // Cannot keep these in alignment.
    return false;
  }

  // 0xff in byte 1 of each 4-byte pixel, i.e. the lane the arithmetic below
  // treats as the green channel.
  alignas(Arch::alignment()) static const uint8_t greenMaski[] = {
      0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
      0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
  };
  // 0xff in byte 3 of each 4-byte pixel, i.e. the lane that receives the
  // recovered alpha.
  alignas(Arch::alignment()) static const uint8_t alphaMaski[] = {
      0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
      0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
  };

  using batch_type = xsimd::batch<uint8_t, Arch>;
  constexpr size_t batch_size = batch_type::size;
  static_assert(batch_size == 16);

  batch_type greenMask = batch_type::load_aligned(greenMaski);
  batch_type alphaMask = batch_type::load_aligned(alphaMaski);

  for (int32_t i = 0; i < size.height; ++i) {
    int32_t j = 0;
    // Loop single pixels until the row pointer is at 16 byte alignment (or
    // the row is exhausted), so the aligned loads/stores below are valid.
    while ((NS_PTR_TO_UINT32(blackData) & 0xf) && j < size.width) {
      *reinterpret_cast<uint32_t*>(blackData) =
          RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                       *reinterpret_cast<uint32_t*>(whiteData));
      blackData += 4;
      whiteData += 4;
      j++;
    }

    // This extra loop allows the compiler to do some more clever register
    // management and makes it about 5% faster than with only the 4 pixel
    // at a time loop.
    for (; j < size.width - 8; j += 8) {
      auto black1 = batch_type::load_aligned(blackData);
      auto white1 = batch_type::load_aligned(whiteData);
      auto black2 = batch_type::load_aligned(blackData + batch_size);
      auto white2 = batch_type::load_aligned(whiteData + batch_size);

      // Execute the same instructions as described in RecoverPixel, only
      // using a packed saturated subtract.
      white1 = xsimd::ssub(white1, black1);
      white2 = xsimd::ssub(white2, black2);
      white1 = xsimd::ssub(greenMask, white1);
      white2 = xsimd::ssub(greenMask, white2);
      // Producing the final black pixel in a SIMD register and storing
      // that is actually faster than doing a masked store since that
      // does an unaligned storage. We have the black pixel in a register
      // anyway.
      black1 = xsimd::bitwise_andnot(black1, alphaMask);
      black2 = xsimd::bitwise_andnot(black2, alphaMask);
      // Move the value computed in the green lane (byte 1) into the alpha
      // lane (byte 3); the mask afterwards discards every other byte.
      white1 = xsimd::slide_left<2>(white1);
      white2 = xsimd::slide_left<2>(white2);
      white1 &= alphaMask;
      white2 &= alphaMask;
      black1 |= white1;
      black2 |= white2;

      black1.store_aligned(blackData);
      black2.store_aligned(blackData + batch_size);
      blackData += 2 * batch_size;
      whiteData += 2 * batch_size;
    }

    // Same computation, one batch (4 pixels) at a time.
    for (; j < size.width - 4; j += 4) {
      auto black = batch_type::load_aligned(blackData);
      auto white = batch_type::load_aligned(whiteData);

      white = xsimd::ssub(white, black);
      white = xsimd::ssub(greenMask, white);
      black = xsimd::bitwise_andnot(black, alphaMask);
      white = xsimd::slide_left<2>(white);
      white &= alphaMask;
      black |= white;
      black.store_aligned(blackData);
      blackData += batch_size;
      whiteData += batch_size;
    }

    // Loop single pixels until we're done.
    while (j < size.width) {
      *reinterpret_cast<uint32_t*>(blackData) =
          RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                       *reinterpret_cast<uint32_t*>(whiteData));
      blackData += 4;
      whiteData += 4;
      j++;
    }

    // Skip any row padding: j pixels (4 bytes each) were consumed, so advance
    // by whatever remains of the stride to reach the next row.
    blackData += blackSurf->Stride() - j * 4;
    whiteData += whiteSurf->Stride() - j * 4;
  }

  blackSurf->MarkDirty();

  return true;
}
#endif

View File

@ -0,0 +1,9 @@
/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "gfxAlphaRecoveryGeneric.h"
template bool gfxAlphaRecoveryGeneric::RecoverAlpha<xsimd::neon>(
gfxImageSurface* blackSurf, const gfxImageSurface* whiteSurf);

View File

@ -3,128 +3,7 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "gfxAlphaRecovery.h"
#include "gfxImageSurface.h"
#include "nsDebug.h"
#include <xsimd/xsimd.hpp>
#include "gfxAlphaRecoveryGeneric.h"
// xsimd architecture tag used for the alignment of the mask tables below and
// for the batch type inside RecoverAlphaSSE2.
using arch = xsimd::sse2;
// This file should only be compiled on x86 and x64 systems. Additionally,
// you'll need to compile it with -msse2 if you're using GCC on x86.
// 16-byte mask with 0xff in byte 1 of every 4-byte pixel (the lane the
// recovery arithmetic treats as green). Aligned so it can be read with
// load_aligned.
alignas(arch::alignment()) static const uint8_t greenMaski[] = {
0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
};
// 16-byte mask with 0xff in byte 3 of every 4-byte pixel (the lane that
// receives the recovered alpha). Aligned so it can be read with load_aligned.
alignas(arch::alignment()) static const uint8_t alphaMaski[] = {
0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
};
// SSE2 implementation of alpha recovery. Recomputes the alpha byte of every
// pixel in |blackSurf| from the difference between the |whiteSurf| and
// |blackSurf| pixels (same arithmetic as RecoverPixel), writing the result
// back into |blackSurf| and marking it dirty.
//
// Returns false, before any pixel is modified, when the surfaces differ in
// size, when either format is not A8R8G8B8_UINT32 / X8R8G8B8_UINT32, or when
// the two buffers cannot be kept at the same 16-byte alignment. Returns true
// once every row has been processed.
bool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
const gfxImageSurface* whiteSurf) {
mozilla::gfx::IntSize size = blackSurf->GetSize();
if (size != whiteSurf->GetSize() ||
(blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) ||
(whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32))
return false;
blackSurf->Flush();
whiteSurf->Flush();
unsigned char* blackData = blackSurf->Data();
unsigned char* whiteData = whiteSurf->Data();
// Both buffers are read with aligned loads after one shared alignment
// prologue, so they must sit at the same offset within a 16-byte block and
// their strides must differ by a multiple of 16 to stay that way on each row.
if ((NS_PTR_TO_UINT32(blackData) & 0xf) !=
(NS_PTR_TO_UINT32(whiteData) & 0xf) ||
(blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
// Cannot keep these in alignment.
return false;
}
using batch_type = xsimd::batch<uint8_t, arch>;
constexpr size_t batch_size = batch_type::size;
// One batch must hold exactly four 32-bit pixels; the loop strides below
// (8 and 4 pixels) depend on it.
static_assert(batch_size == 16);
batch_type greenMask = batch_type::load_aligned(greenMaski);
batch_type alphaMask = batch_type::load_aligned(alphaMaski);
for (int32_t i = 0; i < size.height; ++i) {
int32_t j = 0;
// Loop single pixels until at 16 byte alignment (or the row is exhausted).
while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
*((uint32_t*)blackData) =
RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
*reinterpret_cast<uint32_t*>(whiteData));
blackData += 4;
whiteData += 4;
j++;
}
// This extra loop allows the compiler to do some more clever register
// management and makes it about 5% faster than with only the 4 pixel
// at a time loop.
for (; j < size.width - 8; j += 8) {
auto black1 = batch_type::load_aligned(blackData);
auto white1 = batch_type::load_aligned(whiteData);
auto black2 = batch_type::load_aligned(blackData + batch_size);
auto white2 = batch_type::load_aligned(whiteData + batch_size);
// Execute the same instructions as described in RecoverPixel, only
// using a packed saturated subtract.
white1 = xsimd::ssub(white1, black1);
white2 = xsimd::ssub(white2, black2);
white1 = xsimd::ssub(greenMask, white1);
white2 = xsimd::ssub(greenMask, white2);
// Producing the final black pixel in a register and storing
// that is actually faster than doing a masked store since that
// does an unaligned storage. We have the black pixel in a register
// anyway.
black1 = xsimd::bitwise_andnot(black1, alphaMask);
black2 = xsimd::bitwise_andnot(black2, alphaMask);
// Move the value computed in the green lane (byte 1) into the alpha lane
// (byte 3); the mask afterwards discards every other byte.
white1 = xsimd::slide_left<2>(white1);
white2 = xsimd::slide_left<2>(white2);
white1 &= alphaMask;
white2 &= alphaMask;
black1 |= white1;
black2 |= white2;
black1.store_aligned(blackData);
black2.store_aligned(blackData + batch_size);
blackData += 2 * batch_size;
whiteData += 2 * batch_size;
}
// Same computation, one batch (4 pixels) at a time.
for (; j < size.width - 4; j += 4) {
auto black = batch_type::load_aligned(blackData);
auto white = batch_type::load_aligned(whiteData);
white = xsimd::ssub(white, black);
white = xsimd::ssub(greenMask, white);
black = xsimd::bitwise_andnot(black, alphaMask);
white = xsimd::slide_left<2>(white);
white &= alphaMask;
black |= white;
black.store_aligned(blackData);
blackData += batch_size;
whiteData += batch_size;
}
// Loop single pixels until we're done.
while (j < size.width) {
*((uint32_t*)blackData) =
RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
*reinterpret_cast<uint32_t*>(whiteData));
blackData += 4;
whiteData += 4;
j++;
}
// Skip row padding: j pixels (4 bytes each) were consumed from this row.
blackData += blackSurf->Stride() - j * 4;
whiteData += whiteSurf->Stride() - j * 4;
}
blackSurf->MarkDirty();
return true;
}
// Explicitly instantiate the generic SIMD alpha-recovery routine for SSE2 in
// this translation unit (see the compile-flag note near the top of the file).
template bool gfxAlphaRecovery::RecoverAlphaGeneric<xsimd::sse2>(
gfxImageSurface* blackSurf, const gfxImageSurface* whiteSurf);