Bug 1601622 - Implement NEON accelerated methods for unpacking RGB to RGBA/BGRA. r=lsalzman

These new methods will be used automatically on ARM targets for image
decoding. Specifically, they should reduce the time required to decode GIFs
and opaque PNGs.

Differential Revision: https://phabricator.services.mozilla.com/D56030

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Andrew Osmond 2019-12-06 19:06:00 +00:00
parent e8c12c5adf
commit 49ee6bff30
2 changed files with 75 additions and 0 deletions

View File

@ -214,6 +214,14 @@ void SwizzleRow_NEON(const uint8_t*, uint8_t*, int32_t);
aSrcFormat, aDstFormat, \
SwizzleRow_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
ShouldForceOpaque(aSrcFormat, aDstFormat)>)
// Declaration of the NEON row unpacker for packed 24-bit RGB sources, so the
// UNPACK_ROW_RGB_NEON dispatch macro below can name its instantiations.
// aSwapRB is chosen at the use site via ShouldSwapRB(...). The unnamed
// parameters are presumably (source row, destination row, length) -- confirm
// against the definition.
template <bool aSwapRB>
void UnpackRowRGB24_NEON(const uint8_t*, uint8_t*, int32_t);
// Expands to a FORMAT_CASE_ROW entry for the (SurfaceFormat::R8G8B8 ->
// aDstFormat) pair, bound to the UnpackRowRGB24_NEON instantiation whose
// aSwapRB argument is ShouldSwapRB(SurfaceFormat::R8G8B8, aDstFormat).
// The source format is fixed; only the destination format varies.
# define UNPACK_ROW_RGB_NEON(aDstFormat) \
FORMAT_CASE_ROW( \
SurfaceFormat::R8G8B8, aDstFormat, \
UnpackRowRGB24_NEON<ShouldSwapRB(SurfaceFormat::R8G8B8, aDstFormat)>)
#endif
/**
@ -1036,6 +1044,10 @@ SwizzleRowFn SwizzleRow(SurfaceFormat aSrcFormat, SurfaceFormat aDstFormat) {
#ifdef USE_NEON
if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
UNPACK_ROW_RGB_NEON(SurfaceFormat::R8G8B8X8)
UNPACK_ROW_RGB_NEON(SurfaceFormat::R8G8B8A8)
UNPACK_ROW_RGB_NEON(SurfaceFormat::B8G8R8X8)
UNPACK_ROW_RGB_NEON(SurfaceFormat::B8G8R8A8)
SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)

View File

@ -367,5 +367,68 @@ template void Swizzle_NEON<true, false>(const uint8_t*, int32_t, uint8_t*,
// Explicit instantiation of Swizzle_NEON for the <true, true> template
// arguments (the meaning of the two flags is declared outside this view).
template void Swizzle_NEON<true, true>(const uint8_t*, int32_t, uint8_t*,
int32_t, IntSize);
// Forward declaration of the non-NEON row unpacker, defined outside this
// view. UnpackRowRGB24_NEON delegates to it for rows shorter than 6 pixels
// and for the 2-5 pixel tail that the 16-byte vector loads cannot safely
// cover.
template <bool aSwapRB>
void UnpackRowRGB24(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength);
/**
 * Expands one row of packed 3-byte pixels into 4-byte pixels, setting the
 * fourth byte of every output pixel to 0xFF. The bulk of the row is converted
 * four pixels at a time with NEON; the rest is delegated to UnpackRowRGB24.
 *
 * @tparam aSwapRB When true, the first and third byte of each pixel are
 *                 exchanged while unpacking.
 * @param aSrc     Start of the packed source row (3 bytes per pixel).
 * @param aDst     Start of the destination row (4 bytes per pixel). The row is
 *                 processed back to front so that expanding in place
 *                 (aDst == aSrc) does not overwrite unread source pixels.
 * @param aLength  Number of pixels in the row.
 */
template <bool aSwapRB>
void UnpackRowRGB24_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  // Because this implementation will read an additional 4 bytes of data that
  // is ignored and masked over, we cannot use the accelerated version for the
  // last 1-5 pixels (3-15 bytes remaining) to guarantee we don't access memory
  // outside the buffer (we read in 16 byte chunks).
  if (aLength < 6) {
    UnpackRowRGB24<aSwapRB>(aSrc, aDst, aLength);
    return;
  }

  // Because we are expanding, we can only process the data back to front in
  // case we are performing this in place. alignedRow is a non-zero multiple of
  // 4 that always leaves between 2 and 5 pixels for the tail.
  const int32_t alignedRow = (aLength - 2) & ~3;
  const int32_t remainder = aLength - alignedRow;

  const uint8_t* src = aSrc + alignedRow * 3;
  uint8_t* dst = aDst + alignedRow * 4;

  // Handle 2-5 remaining pixels.
  UnpackRowRGB24<aSwapRB>(src, dst, remainder);

  // Table-lookup indices that spread two packed pixels over two 4-byte slots.
  // The index in every fourth lane is a don't-care: that lane is forced to
  // 0xFF by the OR with `alpha` below.
  uint8x8_t masklo;
  uint8x8_t maskhi;
  if (aSwapRB) {
    static const uint8_t masklo_data[] = {2, 1, 0, 0, 5, 4, 3, 0};
    static const uint8_t maskhi_data[] = {4, 3, 2, 0, 7, 6, 5, 0};
    masklo = vld1_u8(masklo_data);
    maskhi = vld1_u8(maskhi_data);
  } else {
    static const uint8_t masklo_data[] = {0, 1, 2, 0, 3, 4, 5, 0};
    static const uint8_t maskhi_data[] = {2, 3, 4, 0, 5, 6, 7, 0};
    masklo = vld1_u8(masklo_data);
    maskhi = vld1_u8(maskhi_data);
  }

  const uint8x16_t alpha = vreinterpretq_u8_u32(vdupq_n_u32(0xFF000000));

  // Process all 4-pixel chunks as one vector, walking from the end of the
  // aligned region towards aSrc. Counting pixels (rather than comparing src
  // against aSrc) means we never form a pointer before the start of the row.
  for (int32_t pixelsLeft = alignedRow; pixelsLeft > 0; pixelsLeft -= 4) {
    src -= 4 * 3;
    dst -= 4 * 4;

    // Byte-wise load/store: the row pointers carry no 16-bit alignment
    // guarantee, and the value is a uint8x16_t throughout.
    uint8x16_t px = vld1q_u8(src);
    // G2R2B1G1 R1B0G0R0 -> X1R1G1B1 X0R0G0B0
    const uint8x8_t pxlo = vtbl1_u8(vget_low_u8(px), masklo);
    // B3G3R3B2 G2R2B1G1 -> X3R3G3B3 X2R2G2B2
    const uint8x8_t pxhi =
        vtbl1_u8(vext_u8(vget_low_u8(px), vget_high_u8(px), 4), maskhi);
    px = vcombine_u8(pxlo, pxhi);
    px = vorrq_u8(px, alpha);
    vst1q_u8(dst, px);
  }
}
// Force instantiation of both UnpackRowRGB24_NEON variants (without and with
// the R/B swap) here, since the template is only declared, not defined, where
// the UNPACK_ROW_RGB_NEON dispatch macro refers to it.
template void UnpackRowRGB24_NEON<false>(const uint8_t*, uint8_t*, int32_t);
template void UnpackRowRGB24_NEON<true>(const uint8_t*, uint8_t*, int32_t);
} // namespace gfx
} // namespace mozilla