diff --git a/gfx/wr/swgl/src/composite.h b/gfx/wr/swgl/src/composite.h
index b7bb8ccfe725..be84dc4fe3f0 100644
--- a/gfx/wr/swgl/src/composite.h
+++ b/gfx/wr/swgl/src/composite.h
@@ -2,6 +2,29 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
+// Load a partial span > 0 and < 4 pixels.
+template <typename V, typename P>
+static ALWAYS_INLINE V partial_load_span(P* src, int span) {
+  return bit_cast<V>(
+      (span >= 2 ? combine(unaligned_load<V2<P>>(src),
+                           V2<P>{span > 2 ? unaligned_load<P>(src + 2) : 0, 0})
+                 : V4<P>{unaligned_load<P>(src), 0, 0, 0}));
+}
+
+// Store a partial span > 0 and < 4 pixels.
+template <typename V, typename P>
+static ALWAYS_INLINE void partial_store_span(P* dst, V src, int span) {
+  auto pixels = bit_cast<V4<P>>(src);
+  if (span >= 2) {
+    unaligned_store(dst, lowHalf(pixels));
+    if (span > 2) {
+      unaligned_store(dst + 2, pixels.z);
+    }
+  } else {
+    unaligned_store(dst, pixels.x);
+  }
+}
+
 template <typename P>
 static inline void scale_row(P* dst, int dstWidth, const P* src, int srcWidth,
                              int span) {
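(Usage sketch, not part of the patch: how the new helpers are meant to be called for a row tail of 1-3 pixels, assuming swgl's PackedRGBA8 (16 x uint8) and the unaligned load/store machinery from vector_type.h; `row` and `n` are hypothetical names.)

// Hypothetical tail handling for an RGBA8 row with n (1..3) pixels left.
uint32_t* row = /* destination pixels */ nullptr;
int n = 3;
// Load exactly n pixels; the remaining lanes are zero-filled.
PackedRGBA8 dst = partial_load_span<PackedRGBA8>(row, n);
// ... blend or convert dst here ...
// Store exactly n pixels back; bytes past the end of the row are never
// touched, unlike the old span_mask path's full-width masked load/store.
partial_store_span(row, dst, n);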
@@ -96,9 +119,7 @@ static void linear_row_blit(uint32_t* dest, int span, const vec2_scalar& srcUV,
   }
   if (span > 0) {
     auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset);
-    auto mask = span_mask_RGBA8(span);
-    auto dstpx = unaligned_load<PackedRGBA8>(dest);
-    unaligned_store(dest, (mask & dstpx) | (~mask & srcpx));
+    partial_store_span(dest, srcpx, span);
   }
 }
 
@@ -114,9 +135,7 @@ static void linear_row_blit(uint8_t* dest, int span, const vec2_scalar& srcUV,
   }
   if (span > 0) {
     auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset);
-    auto mask = span_mask_R8(span);
-    auto dstpx = unpack(unaligned_load<PackedR8>(dest));
-    unaligned_store(dest, pack((mask & dstpx) | (~mask & srcpx)));
+    partial_store_span(dest, pack(srcpx), span);
   }
 }
 
@@ -132,9 +151,7 @@ static void linear_row_blit(uint16_t* dest, int span, const vec2_scalar& srcUV,
   }
   if (span > 0) {
     auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv), srcZOffset);
-    auto mask = span_mask_RG8(span);
-    auto dstpx = unaligned_load<PackedRG8>(dest);
-    unaligned_store(dest, (mask & dstpx) | (~mask & srcpx));
+    partial_store_span(dest, srcpx, span);
   }
 }
 
@@ -215,12 +232,9 @@ static void linear_row_composite(uint32_t* dest, int span,
   }
   if (span > 0) {
     WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv), 0);
-    PackedRGBA8 dstpx = unaligned_load<PackedRGBA8>(dest);
-    WideRGBA8 dstpxu = unpack(dstpx);
-    PackedRGBA8 r = pack(srcpx + dstpxu - muldiv255(dstpxu, alphas(srcpx)));
-
-    auto mask = span_mask_RGBA8(span);
-    unaligned_store(dest, (mask & dstpx) | (~mask & r));
+    WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dest, span));
+    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
+    partial_store_span(dest, r, span);
   }
 }
 
@@ -367,11 +381,29 @@ void* GetResourceBuffer(LockedTexture* resource, int32_t* width,
   return resource->buf;
 }
 
+static void unscaled_row_composite(uint32_t* dest, const uint32_t* src,
+                                   int span) {
+  const uint32_t* end = src + span;
+  while (src + 4 <= end) {
+    WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
+    WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
+    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
+    unaligned_store(dest, r);
+    src += 4;
+    dest += 4;
+  }
+  if (src < end) {
+    WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
+    WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dest, end - src));
+    auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
+    partial_store_span(dest, r, end - src);
+  }
+}
+
 static NO_INLINE void unscaled_composite(Texture& srctex, const IntRect& srcReq,
                                          Texture& dsttex, const IntRect& dstReq,
                                          bool invertY, int bandOffset,
                                          int bandHeight) {
-  const int bpp = 4;
   IntRect bounds = dsttex.sample_bounds(dstReq, invertY);
   bounds.intersect(srctex.sample_bounds(srcReq));
   char* dest = dsttex.sample_ptr(dstReq, bounds, 0, invertY);
@@ -385,32 +417,10 @@ static NO_INLINE void unscaled_composite(Texture& srctex, const IntRect& srcReq,
   src += srcStride * bandOffset;
   for (int rows = min(bounds.height() - bandOffset, bandHeight); rows > 0;
        rows--) {
-    char* end = src + bounds.width() * bpp;
-    while (src + 4 * bpp <= end) {
-      WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
-      WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
-      PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
-      unaligned_store(dest, r);
-      src += 4 * bpp;
-      dest += 4 * bpp;
-    }
-    if (src < end) {
-      WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
-      WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
-      U32 r =
-          bit_cast<U32>(pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))));
-      unaligned_store(dest, r.x);
-      if (src + bpp < end) {
-        unaligned_store(dest + bpp, r.y);
-        if (src + 2 * bpp < end) {
-          unaligned_store(dest + 2 * bpp, r.z);
-        }
-      }
-      dest += end - src;
-      src = end;
-    }
-    dest += destStride - bounds.width() * bpp;
-    src += srcStride - bounds.width() * bpp;
+    unscaled_row_composite((uint32_t*)dest, (const uint32_t*)src,
+                           bounds.width());
+    dest += destStride;
+    src += srcStride;
   }
 }
 
@@ -719,9 +729,7 @@ static void linear_row_yuv(uint32_t* dest, int span, const vec2_scalar& srcUV,
     auto uvPx = textureLinearRowPairedR8(&sampler[1], &sampler[2], cU,
                                          cOffsetV, cStrideV, cFracV);
     auto srcpx = YUVConverter<COLOR_SPACE>::convert(yPx, uvPx);
-    auto mask = span_mask_RGBA8(span);
-    auto dstpx = unaligned_load<PackedRGBA8>(dest);
-    unaligned_store(dest, (mask & dstpx) | (~mask & srcpx));
+    partial_store_span(dest, srcpx, span);
   }
 }
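(Aside for review: every composite path above applies the same premultiplied-alpha OVER arithmetic, dst' = src + dst - muldiv255(dst, src.a). The sketch below is a scalar model of one standard exact rounded (x*y)/255 for checking the vector math against; it is an assumption that swgl's vectorized muldiv255 uses an equivalent formulation.)

#include <cstdint>

// Exact rounded (x * y) / 255 for 8-bit inputs, using shifts instead of a
// divide: with t = x*y + 128, (t + (t >> 8)) >> 8 equals round(x*y / 255).
static inline uint32_t muldiv255_scalar(uint32_t x, uint32_t y) {
  uint32_t t = x * y + 128;
  return (t + (t >> 8)) >> 8;
}

// Premultiplied-alpha OVER for a single 8-bit channel:
// dst' = src + dst - dst * src_a / 255.
static inline uint8_t over_channel(uint8_t src, uint8_t dst, uint8_t src_a) {
  return static_cast<uint8_t>(src + dst - muldiv255_scalar(dst, src_a));
}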
diff --git a/gfx/wr/swgl/src/gl.cc b/gfx/wr/swgl/src/gl.cc
index 5842cfd2927d..7310eb4fceff 100644
--- a/gfx/wr/swgl/src/gl.cc
+++ b/gfx/wr/swgl/src/gl.cc
@@ -2723,10 +2723,10 @@ static inline WideRGBA8 pack_pixels_RGBA8(const vec4& v) {
   ivec4 i = round_pixel(v);
   HalfRGBA8 xz = packRGBA8(i.z, i.x);
   HalfRGBA8 yw = packRGBA8(i.y, i.w);
-  HalfRGBA8 xy = zipLow(xz, yw);
-  HalfRGBA8 zw = zipHigh(xz, yw);
-  HalfRGBA8 lo = zip2Low(xy, zw);
-  HalfRGBA8 hi = zip2High(xy, zw);
+  HalfRGBA8 xyzwl = zipLow(xz, yw);
+  HalfRGBA8 xyzwh = zipHigh(xz, yw);
+  HalfRGBA8 lo = zip2Low(xyzwl, xyzwh);
+  HalfRGBA8 hi = zip2High(xyzwl, xyzwh);
   return combine(lo, hi);
 }
 
@@ -2916,7 +2916,7 @@ static inline WideR8 span_mask(uint8_t*, int span) {
   return span_mask_R8(span);
 }
 
-static inline PackedRG8 span_mask_RG8(int span) {
+UNUSED static inline PackedRG8 span_mask_RG8(int span) {
   return bit_cast<PackedRG8>(I16(span) < I16{1, 2, 3, 4});
 }
 
diff --git a/gfx/wr/swgl/src/vector_type.h b/gfx/wr/swgl/src/vector_type.h
index 2ef42c6896ea..cdb10a7e6724 100644
--- a/gfx/wr/swgl/src/vector_type.h
+++ b/gfx/wr/swgl/src/vector_type.h
@@ -38,6 +38,16 @@ SI VectorType<T, 16> combine(VectorType<T, 8> a, VectorType<T, 8> b) {
                                  13, 14, 15);
 }
 
+template <typename T>
+SI VectorType<T, 2> lowHalf(VectorType<T, 4> a) {
+  return __builtin_shufflevector(a, a, 0, 1);
+}
+
+template <typename T>
+SI VectorType<T, 2> highHalf(VectorType<T, 4> a) {
+  return __builtin_shufflevector(a, a, 2, 3);
+}
+
 template <typename T>
 SI VectorType<T, 4> lowHalf(VectorType<T, 8> a) {
   return __builtin_shufflevector(a, a, 0, 1, 2, 3);
@@ -343,6 +353,12 @@ struct VectorType {
     };
     T elements[2];
   };
+
+  SI VectorType wrap(const data_type& data) {
+    VectorType v;
+    v.data = data;
+    return v;
+  }
 };
 
 # define CONVERT(vector, type) ((type)(vector))
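(Aside for review: partial_store_span's two-pixel store is what motivates the new VectorType<T, 4> -> VectorType<T, 2> lowHalf/highHalf overloads above. Below is a minimal standalone model of those shuffles using Clang vector extensions; the u32x4/u32x2 typedefs and names are illustrative, not swgl's.)

#include <cstdint>

typedef uint32_t u32x4 __attribute__((vector_size(16)));
typedef uint32_t u32x2 __attribute__((vector_size(8)));

// Elements {a0, a1}: an 8-byte value suitable for a single unaligned store,
// e.g. the two-pixel case in partial_store_span.
static inline u32x2 low_half(u32x4 a) {
  return __builtin_shufflevector(a, a, 0, 1);
}

// Elements {a2, a3}.
static inline u32x2 high_half(u32x4 a) {
  return __builtin_shufflevector(a, a, 2, 3);
}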