Bug 1692731 - Accelerate YUV 422 compositing in SWGL. r=jrmuizel

For YUV 422 video, when we are sampling UV planes at half the resolution of the
Y plane, we can interpolate from 2 samples for the UV planes as an approximation
of the 4 samples, allowing us to better pack the math into SIMD vectors and
substantially reduce the number of multiplications.

Differential Revision: https://phabricator.services.mozilla.com/D105137
This commit is contained in:
Lee Salzman 2021-02-16 21:17:45 +00:00
parent 20f75fe5f0
commit ba5d757a8a
3 changed files with 197 additions and 15 deletions

View File

@ -1,5 +1,5 @@
skip-if(Android) fuzzy-if(OSX,0-80,0-76800) fuzzy-if(winWidget,0-62,0-76799) fuzzy-if(gtkWidget&&layersGPUAccelerated,0-70,0-600) HTTP(..) == short.mp4.firstframe.html short.mp4.firstframe-ref.html
skip-if(Android) fuzzy-if(OSX,0-87,0-76797) fuzzy-if(winWidget,0-60,0-76797) fuzzy-if(gtkWidget&&layersGPUAccelerated,0-60,0-1800) HTTP(..) == short.mp4.lastframe.html short.mp4.lastframe-ref.html
skip-if(Android) fuzzy-if(OSX,0-80,0-76800) fuzzy-if(winWidget,0-62,0-76799) fuzzy-if(gtkWidget&&layersGPUAccelerated,0-70,0-644) HTTP(..) == short.mp4.firstframe.html short.mp4.firstframe-ref.html
skip-if(Android) fuzzy-if(OSX,0-87,0-76797) fuzzy-if(winWidget,0-60,0-76797) fuzzy-if(gtkWidget&&layersGPUAccelerated,0-60,0-1810) HTTP(..) == short.mp4.lastframe.html short.mp4.lastframe-ref.html
skip-if(Android) skip-if(winWidget) fuzzy-if(gtkWidget&&layersGPUAccelerated,0-57,0-4281) fuzzy-if(OSX,55-80,4173-4417) HTTP(..) == bipbop_300_215kbps.mp4.lastframe.html bipbop_300_215kbps.mp4.lastframe-ref.html
skip-if(Android) fuzzy-if(OSX,0-25,0-175921) fuzzy-if(winWidget,0-71,0-179198) fuzzy-if((/^Windows\x20NT\x2010\.0/.test(http.oscpu))&&(/^aarch64-msvc/.test(xulRuntime.XPCOMABI)),0-255,0-179500) HTTP(..) == gizmo.mp4.seek.html gizmo.mp4.55thframe-ref.html
skip-if(Android) skip-if(MinGW) skip-if((/^Windows\x20NT\x2010\.0/.test(http.oscpu))&&(/^aarch64-msvc/.test(xulRuntime.XPCOMABI))) fuzzy(0-10,0-778236) == image-10bits-rendering-video.html image-10bits-rendering-ref.html

View File

@ -674,17 +674,143 @@ static inline V8<int16_t> textureLinearRowPairedR8(S sampler, S sampler2,
return abcdxyzwl;
}
// Casting to int loses some precision while stepping that can offset the
// image, so shift the values by some extra bits of precision to minimize
// this. We support up to 16 bits of image size, 7 bits of quantization,
// and 1 bit for sign, which leaves 8 bits left for extra precision
// (with 32-bit coordinates: 32 - 16 - 7 - 1 = 8). Stepped coordinates thus
// carry STEP_BITS + 7 fractional bits; shifting right by STEP_BITS + 7 yields
// the integer texel index.
const int STEP_BITS = 8;
// Optimized version of textureLinearPackedR8 for Y R8 texture with
// half-resolution paired U/V R8 textures. This allows us to more efficiently
// pack YUV samples into vectors to substantially reduce math operations even
// further.
//
// Parameters:
//   dest, span  - output pixels. The loop stores 4 pixels per iteration and
//                 runs until dest reaches dest + span, so span should be a
//                 multiple of 4.
//   sampler     - sampler[0] is read as the Y plane; sampler[1] and
//                 sampler[2] are read as the two chroma (U/V) planes. All
//                 three buffers are accessed as 8-bit samples.
//   yU, cU      - per-lane X coordinates for the luma and chroma planes, in
//                 fixed point with STEP_BITS + 7 fractional bits.
//   yDU, cDU    - amount added to yU / cU once per 4-pixel chunk.
//   yOffsetV,
//   cOffsetV    - offset into each plane's buffer of the first sampled row.
//   yStrideV,
//   cStrideV    - additional offset from the first sampled row to the second
//                 row used for vertical blending.
//   yFracV,
//   cFracV      - vertical blend weights, applied as (delta * frac) >> 7.
//
// No bounds checking happens here. Every iteration loads the samples for the
// *next* chunk position (after stepping yU / cU), so the caller must ensure
// that those reads stay inside the plane buffers.
template <YUVColorSpace COLOR_SPACE>
static inline void upscaleYUV42R8(uint32_t* dest, int span,
sampler2D_impl sampler[3], I32 yU,
int32_t yDU, int32_t yOffsetV,
int32_t yStrideV, int16_t yFracV, I32 cU,
int32_t cDU, int32_t cOffsetV,
int32_t cStrideV, int16_t cFracV) {
// As much as possible try to utilize the fact that we're only using half
// the UV samples to combine Y and UV samples into single vectors. Here we
// need to initialize several useful vector quantities for stepping fractional
// offsets. For the UV samples, we take the average of the first+second and
// third+fourth samples in a chunk which conceptually correspond to offsets
// 0.5 and 1.5 (in 0..2 range). This allows us to reconstruct intermediate
// samples 0.25, 0.75, 1.25, and 1.75 later. X fraction is shifted over into
// the top 7 bits of an unsigned short so that we can mask off the exact
// fractional bits we need to blend merely by right shifting them into
// position.
cU = (cU.xzxz + cU.ywyw) >> 1;
auto ycFracX = CONVERT(combine(yU, cU), V8<uint16_t>)
<< (16 - (STEP_BITS + 7));
auto ycFracDX = combine(I16(yDU), I16(cDU)) << (16 - (STEP_BITS + 7));
auto ycFracV = combine(I16(yFracV), I16(cFracV));
// Integer texel indices: drop the STEP_BITS + 7 fractional bits.
I32 yI = yU >> (STEP_BITS + 7);
I32 cI = cU >> (STEP_BITS + 7);
// Base pointers to the first sampled row of each plane.
uint8_t* yRow = (uint8_t*)sampler[0].buf + yOffsetV;
uint8_t* cRow1 = (uint8_t*)sampler[1].buf + cOffsetV;
uint8_t* cRow2 = (uint8_t*)sampler[2].buf + cOffsetV;
// Load initial combined YUV samples for each row and blend them.
// Lane layout of each 8-wide vector: 4 Y samples in the low half, then
// 2 samples from sampler[1] followed by 2 samples from sampler[2].
auto ycSrc0 =
CONVERT(combine(unaligned_load<V4<uint8_t>>(&yRow[yI.x]),
combine(unaligned_load<V2<uint8_t>>(&cRow1[cI.x]),
unaligned_load<V2<uint8_t>>(&cRow2[cI.x]))),
V8<int16_t>);
auto ycSrc1 = CONVERT(
combine(unaligned_load<V4<uint8_t>>(&yRow[yI.x + yStrideV]),
combine(unaligned_load<V2<uint8_t>>(&cRow1[cI.x + cStrideV]),
unaligned_load<V2<uint8_t>>(&cRow2[cI.x + cStrideV]))),
V8<int16_t>);
// Vertical blend between the two rows using the 7-bit weights.
auto ycSrc = ycSrc0 + (((ycSrc1 - ycSrc0) * ycFracV) >> 7);
// Here we shift in results from the next sample while caching results from
// the previous sample. This allows us to reduce the multiplications in the
// inner loop down to only two since we just need to blend the new samples
// horizontally and then vertically once each.
for (uint32_t* end = dest + span; dest < end; dest += 4) {
// Step to the next chunk first; its samples are needed as the right-hand
// neighbors when blending the current chunk horizontally.
yU += yDU;
I32 yIn = yU >> (STEP_BITS + 7);
cU += cDU;
I32 cIn = cU >> (STEP_BITS + 7);
// Load combined YUV samples for the next chunk on each row and blend them.
auto ycSrc0n =
CONVERT(combine(unaligned_load<V4<uint8_t>>(&yRow[yIn.x]),
combine(unaligned_load<V2<uint8_t>>(&cRow1[cIn.x]),
unaligned_load<V2<uint8_t>>(&cRow2[cIn.x]))),
V8<int16_t>);
auto ycSrc1n = CONVERT(
combine(unaligned_load<V4<uint8_t>>(&yRow[yIn.x + yStrideV]),
combine(unaligned_load<V2<uint8_t>>(&cRow1[cIn.x + cStrideV]),
unaligned_load<V2<uint8_t>>(&cRow2[cIn.x + cStrideV]))),
V8<int16_t>);
auto ycSrcn = ycSrc0n + (((ycSrc1n - ycSrc0n) * ycFracV) >> 7);
// The source samples for the chunk may not match the actual tap offsets.
// Since we're upscaling, we know the tap offsets fall within all the
// samples in a 4-wide chunk. Since we can't rely on PSHUFB or similar,
// instead we do laborious shuffling here for the Y samples and then the UV
// samples.
auto yshuf = lowHalf(ycSrc);
auto yshufn =
SHUFFLE(yshuf, yIn.x == yI.w ? lowHalf(ycSrcn).yyyy : lowHalf(ycSrcn),
1, 2, 3, 4);
// Wherever two adjacent lanes map to the same texel index, duplicate the
// sample so each lane holds the sample for its own tap.
if (yI.y == yI.x) {
yshuf = yshuf.xxyz;
yshufn = yshufn.xxyz;
}
if (yI.z == yI.y) {
yshuf = yshuf.xyyz;
yshufn = yshufn.xyyz;
}
if (yI.w == yI.z) {
yshuf = yshuf.xyzz;
yshufn = yshufn.xyzz;
}
// Same idea for chroma, which only has two taps per plane per chunk.
auto cshuf = highHalf(ycSrc);
auto cshufn =
SHUFFLE(cshuf, cIn.x == cI.y ? highHalf(ycSrcn).yyww : highHalf(ycSrcn),
1, 4, 3, 6);
if (cI.y == cI.x) {
cshuf = cshuf.xxzz;
cshufn = cshufn.xxzz;
}
// After shuffling, combine the Y and UV samples back into a single vector
// for blending. Shift X fraction into position as unsigned to mask off top
// bits and get rid of low bits to avoid multiplication overflow.
auto yuvPx = combine(yshuf, cshuf);
yuvPx += ((combine(yshufn, cshufn) - yuvPx) *
bit_cast<V8<int16_t>>(ycFracX >> (16 - 7))) >>
7;
// Cache the new samples as the current samples on the next iteration.
ycSrc = ycSrcn;
ycFracX += ycFracDX;
yI = yIn;
cI = cIn;
// De-interleave the Y and UV results. We need to average the UV results
// to produce values for intermediate samples. Taps for UV were collected at
// offsets 0.5 and 1.5, such that if we take a quarter of the difference
// (1.5-0.5)/4, subtract it from even samples, and add it to odd samples,
// we can estimate samples 0.25, 0.75, 1.25, and 1.75.
auto yPx = SHUFFLE(yuvPx, yuvPx, 0, 0, 1, 1, 2, 2, 3, 3);
auto uvPx = SHUFFLE(yuvPx, yuvPx, 4, 6, 4, 6, 5, 7, 5, 7) +
((SHUFFLE(yuvPx, yuvPx, 4, 6, 5, 7, 4, 6, 5, 7) -
SHUFFLE(yuvPx, yuvPx, 5, 7, 4, 6, 5, 7, 4, 6)) >>
2);
// Convert the 4 pixels of this chunk with the selected color space and
// write them out.
unaligned_store(dest, YUVConverter<COLOR_SPACE>::convert(yPx, uvPx));
}
}
template <YUVColorSpace COLOR_SPACE>
static void linear_row_yuv(uint32_t* dest, int span, const vec2_scalar& srcUV,
float srcDU, const vec2_scalar& chromaUV,
float chromaDU, sampler2D_impl sampler[3],
int colorDepth) {
// Casting to int loses some precision while stepping that can offset the
// image, so shift the values by some extra bits of precision to minimize
// this. We support up to 16 bits of image size, 7 bits of quantization,
// and 1 bit for sign, which leaves 8 bits left for extra precision.
const int STEP_BITS = 8;
// Calculate varying and constant interp data for Y plane.
I32 yU = cast(init_interp(srcUV.x, srcDU) * (1 << STEP_BITS));
int32_t yV = int32_t(srcUV.y);
@ -717,12 +843,13 @@ static void linear_row_yuv(uint32_t* dest, int span, const vec2_scalar& srcUV,
partial_store_span(dest, rgb, span);
}
} else if (sampler[0].format == TextureFormat::R16) {
// Sample each YUV plane, rescale it to fit in low 8 bits of word, and then
// transform them by the appropriate color space.
// Sample each YUV plane, rescale it to fit in low 8 bits of word, and
// then transform them by the appropriate color space.
assert(colorDepth > 8);
// Need to right shift the sample by the amount of bits over 8 it occupies.
// On output from textureLinearUnpackedR16, we have lost 1 bit of precision
// at the low end already, hence 1 is subtracted from the color depth.
// Need to right shift the sample by the amount of bits over 8 it
// occupies. On output from textureLinearUnpackedR16, we have lost 1 bit
// of precision at the low end already, hence 1 is subtracted from the
// color depth.
int rescaleBits = (colorDepth - 1) - 8;
for (; span >= 4; span -= 4) {
auto yPx =
@ -774,9 +901,45 @@ static void linear_row_yuv(uint32_t* dest, int span, const vec2_scalar& srcUV,
int32_t cStrideV =
cV >= 0 && cV < int32_t(sampler[1].height) - 1 ? sampler[1].stride : 0;
// If we're sampling the UV planes at half the resolution of the Y plane,
// then try to use half resolution fast-path.
if (yDU >= cDU && yDU <= (4 << (STEP_BITS + 7)) &&
cDU <= (2 << (STEP_BITS + 7))) {
// Ensure that samples don't fall outside of the valid bounds of each
// planar texture. Step until the initial X coordinates are positive.
for (; (yU.x < 0 || cU.x < 0) && span >= 4; span -= 4) {
auto yPx = textureLinearRowR8(&sampler[0], yU >> STEP_BITS, yOffsetV,
yStrideV, yFracV);
auto uvPx =
textureLinearRowPairedR8(&sampler[1], &sampler[2], cU >> STEP_BITS,
cOffsetV, cStrideV, cFracV);
unaligned_store(dest, YUVConverter<COLOR_SPACE>::convert(yPx, uvPx));
dest += 4;
yU += yDU;
cU += cDU;
}
// Calculate the number of aligned chunks that we can step inside the
// bounds of each planar texture without overreading.
int inside = min(
min((((int(sampler[0].width) - 4) << (STEP_BITS + 7)) - yU.x) / yDU,
(((int(sampler[1].width) - 4) << (STEP_BITS + 7)) - cU.x) / cDU) *
4,
span & ~3);
if (inside > 0) {
upscaleYUV42R8<COLOR_SPACE>(dest, inside, sampler, yU, yDU, yOffsetV,
yStrideV, yFracV, cU, cDU, cOffsetV,
cStrideV, cFracV);
span -= inside;
dest += inside;
yU += (inside / 4) * yDU;
cU += (inside / 4) * cDU;
}
// If there are any remaining chunks that weren't inside, handle them
// below.
}
for (; span >= 4; span -= 4) {
// Sample each YUV plane and then transform them by the appropriate color
// space.
// Sample each YUV plane and then transform them by the appropriate
// color space.
auto yPx = textureLinearRowR8(&sampler[0], yU >> STEP_BITS, yOffsetV,
yStrideV, yFracV);
auto uvPx =

View File

@ -319,11 +319,20 @@ struct VectorType {
# define yyyy swizzle(1, 1, 1, 1)
# define zzzz swizzle(2, 2, 2, 2)
# define wwww swizzle(3, 3, 3, 3)
# define xxyy swizzle(0, 0, 1, 1)
# define xxzz swizzle(0, 0, 2, 2)
# define yyww swizzle(1, 1, 3, 3)
# define zzww swizzle(2, 2, 3, 3)
# define xyxy swizzle(0, 1, 0, 1)
# define xzxz swizzle(0, 2, 0, 2)
# define ywyw swizzle(1, 3, 1, 3)
# define zwzw swizzle(2, 3, 2, 3)
# define zwxy swizzle(2, 3, 0, 1)
# define zyxw swizzle(2, 1, 0, 3)
# define xxyz swizzle(0, 0, 1, 2)
# define xyyz swizzle(0, 1, 1, 2)
# define xyzz swizzle(0, 1, 2, 2)
# define xzyw swizzle(0, 2, 1, 3)
# define yzwx swizzle(1, 2, 3, 0)
# define wxyz swizzle(3, 0, 1, 2)
# define xxxxyyyy XXXXYYYY()
@ -407,6 +416,16 @@ SI VectorType<T, N * 4> combine(VectorType<T, N> a, VectorType<T, N> b,
return combine(combine(a, b), combine(c, d));
}
// Builds a vector of the same width as the inputs whose lower half is the
// lower half of `a` and whose upper half is the lower half of `b`.
template <typename T, int N>
SI VectorType<T, N> combineLow(VectorType<T, N> a, VectorType<T, N> b) {
  auto firstLow = lowHalf(a);
  auto secondLow = lowHalf(b);
  return combine(firstLow, secondLow);
}
// Builds a vector of the same width as the inputs whose lower half is the
// upper half of `a` and whose upper half is the upper half of `b`.
template <typename T, int N>
SI VectorType<T, N> combineHigh(VectorType<T, N> a, VectorType<T, N> b) {
  auto firstHigh = highHalf(a);
  auto secondHigh = highHalf(b);
  return combine(firstHigh, secondHigh);
}
template <typename T>
SI VectorType<T, 4> zipLow(VectorType<T, 4> a, VectorType<T, 4> b) {
return SHUFFLE(a, b, 0, 4, 1, 5);