diff --git a/gfx/wr/swgl/src/composite.h b/gfx/wr/swgl/src/composite.h
index b7bb8ccfe725..be84dc4fe3f0 100644
--- a/gfx/wr/swgl/src/composite.h
+++ b/gfx/wr/swgl/src/composite.h
@@ -2,6 +2,29 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
+// Load a partial span > 0 and < 4 pixels.
+template <typename V, typename P>
+static ALWAYS_INLINE V partial_load_span(P* src, int span) {
+  return bit_cast<V>(
+      (span >= 2 ? combine(unaligned_load<V2<P>>(src),
+                           V2<P>{span > 2 ? unaligned_load<P>(src + 2) : 0, 0})
+                 : V4<P>{unaligned_load<P>(src), 0, 0, 0}));
+}
+
+// Store a partial span > 0 and < 4 pixels.
+template <typename V, typename P>
+static ALWAYS_INLINE void partial_store_span(P* dst, V src, int span) {
+  auto pixels = bit_cast<V4<P>>(src);
+  if (span >= 2) {
+    unaligned_store(dst, lowHalf(pixels));
+    if (span > 2) {
+      unaligned_store(dst + 2, pixels.z);
+    }
+  } else {
+    unaligned_store(dst, pixels.x);
+  }
+}
+
 template <typename P>
 static inline void scale_row(P* dst, int dstWidth, const P* src, int srcWidth,
                              int span) {
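(Usage sketch, not part of the patch: how the new helpers are meant to be called for a row tail of 1-3 pixels, assuming swgl's PackedRGBA8 (16 x uint8) and the unaligned load/store machinery from vector_type.h; `row` and `n` are hypothetical names.)

// Hypothetical tail handling for an RGBA8 row with n (1..3) pixels left.
uint32_t* row = /* destination pixels */ nullptr;
int n = 3;
// Load exactly n pixels; the remaining lanes are zero-filled.
PackedRGBA8 dst = partial_load_span<PackedRGBA8>(row, n);
// ... blend or convert dst here ...
// Store exactly n pixels back; bytes past the end of the row are never
// touched, unlike the old span_mask path's full-width masked load/store.
partial_store_span(row, dst, n);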
@@ -96,9 +119,7 @@ static void linear_row_blit(uint32_t* dest, int span, const vec2_scalar& srcUV,
   }
   if (span > 0) {
     auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset);
-    auto mask = span_mask_RGBA8(span);
-    auto dstpx = unaligned_load<PackedRGBA8>(dest);
-    unaligned_store(dest, (mask & dstpx) | (~mask & srcpx));
+    partial_store_span(dest, srcpx, span);
   }
 }
 
@@ -114,9 +135,7 @@ static void linear_row_blit(uint8_t* dest, int span, const vec2_scalar& srcUV,
   }
   if (span > 0) {
     auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset);
-    auto mask = span_mask_R8(span);
-    auto dstpx = unpack(unaligned_load<PackedR8>(dest));
-    unaligned_store(dest, pack((mask & dstpx) | (~mask & srcpx)));
+    partial_store_span(dest, pack(srcpx), span);
   }
 }
 
@@ -132,9 +151,7 @@ static void linear_row_blit(uint16_t* dest, int span, const vec2_scalar& srcUV,
   }
   if (span > 0) {
     auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv), srcZOffset);
-    auto mask = span_mask_RG8(span);
-    auto dstpx = unaligned_load<PackedRG8>(dest);
-    unaligned_store(dest, (mask & dstpx) | (~mask & srcpx));
+    partial_store_span(dest, srcpx, span);
   }
 }
 
@@ -215,12 +232,9 @@ static void linear_row_composite(uint32_t* dest, int span,
   }
   if (span > 0) {
     WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv), 0);
-    PackedRGBA8 dstpx = unaligned_load<PackedRGBA8>(dest);
-    WideRGBA8 dstpxu = unpack(dstpx);
-    PackedRGBA8 r = pack(srcpx + dstpxu - muldiv255(dstpxu, alphas(srcpx)));
-
-    auto mask = span_mask_RGBA8(span);
-    unaligned_store(dest, (mask & dstpx) | (~mask & r));
+    WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dest, span));
+    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
+    partial_store_span(dest, r, span);
   }
 }
 
@@ -367,11 +381,29 @@ void* GetResourceBuffer(LockedTexture* resource, int32_t* width,
   return resource->buf;
 }
 
+static void unscaled_row_composite(uint32_t* dest, const uint32_t* src,
+                                   int span) {
+  const uint32_t* end = src + span;
+  while (src + 4 <= end) {
+    WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
+    WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
+    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
+    unaligned_store(dest, r);
+    src += 4;
+    dest += 4;
+  }
+  if (src < end) {
+    WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
+    WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dest, end - src));
+    auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
+    partial_store_span(dest, r, end - src);
+  }
+}
+
 static NO_INLINE void unscaled_composite(Texture& srctex, const IntRect& srcReq,
                                          Texture& dsttex, const IntRect& dstReq,
                                          bool invertY, int bandOffset,
                                          int bandHeight) {
-  const int bpp = 4;
   IntRect bounds = dsttex.sample_bounds(dstReq, invertY);
   bounds.intersect(srctex.sample_bounds(srcReq));
   char* dest = dsttex.sample_ptr(dstReq, bounds, 0, invertY);
@@ -385,32 +417,10 @@ static NO_INLINE void unscaled_composite(Texture& srctex, const IntRect& srcReq,
   src += srcStride * bandOffset;
   for (int rows = min(bounds.height() - bandOffset, bandHeight); rows > 0;
        rows--) {
-    char* end = src + bounds.width() * bpp;
-    while (src + 4 * bpp <= end) {
-      WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
-      WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
-      PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
-      unaligned_store(dest, r);
-      src += 4 * bpp;
-      dest += 4 * bpp;
-    }
-    if (src < end) {
-      WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
-      WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
-      U32 r =
-          bit_cast<U32>(pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))));
-      unaligned_store(dest, r.x);
-      if (src + bpp < end) {
-        unaligned_store(dest + bpp, r.y);
-        if (src + 2 * bpp < end) {
-          unaligned_store(dest + 2 * bpp, r.z);
-        }
-      }
-      dest += end - src;
-      src = end;
-    }
-    dest += destStride - bounds.width() * bpp;
-    src += srcStride - bounds.width() * bpp;
+    unscaled_row_composite((uint32_t*)dest, (const uint32_t*)src,
+                           bounds.width());
+    dest += destStride;
+    src += srcStride;
   }
 }
 
@@ -719,9 +729,7 @@ static void linear_row_yuv(uint32_t* dest, int span, const vec2_scalar& srcUV,
     auto uvPx = textureLinearRowPairedR8(&sampler[1], &sampler[2], cU,
                                          cOffsetV, cStrideV, cFracV);
     auto srcpx = YUVConverter<COLOR_SPACE>::convert(yPx, uvPx);
-    auto mask = span_mask_RGBA8(span);
-    auto dstpx = unaligned_load<PackedRGBA8>(dest);
-    unaligned_store(dest, (mask & dstpx) | (~mask & srcpx));
+    partial_store_span(dest, srcpx, span);
   }
 }
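(Aside for review: every composite path above applies the same premultiplied-alpha OVER arithmetic, dst' = src + dst - muldiv255(dst, src.a). The sketch below is a scalar model of one standard exact rounded (x*y)/255 for checking the vector math against; it is an assumption that swgl's vectorized muldiv255 uses an equivalent formulation.)

#include <cstdint>

// Exact rounded (x * y) / 255 for 8-bit inputs, using shifts instead of a
// divide: with t = x*y + 128, (t + (t >> 8)) >> 8 equals round(x*y / 255).
static inline uint32_t muldiv255_scalar(uint32_t x, uint32_t y) {
  uint32_t t = x * y + 128;
  return (t + (t >> 8)) >> 8;
}

// Premultiplied-alpha OVER for a single 8-bit channel:
// dst' = src + dst - dst * src_a / 255.
static inline uint8_t over_channel(uint8_t src, uint8_t dst, uint8_t src_a) {
  return static_cast<uint8_t>(src + dst - muldiv255_scalar(dst, src_a));
}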
diff --git a/gfx/wr/swgl/src/gl.cc b/gfx/wr/swgl/src/gl.cc
index 5842cfd2927d..7310eb4fceff 100644
--- a/gfx/wr/swgl/src/gl.cc
+++ b/gfx/wr/swgl/src/gl.cc
@@ -2723,10 +2723,10 @@ static inline WideRGBA8 pack_pixels_RGBA8(const vec4& v) {
   ivec4 i = round_pixel(v);
   HalfRGBA8 xz = packRGBA8(i.z, i.x);
   HalfRGBA8 yw = packRGBA8(i.y, i.w);
-  HalfRGBA8 xy = zipLow(xz, yw);
-  HalfRGBA8 zw = zipHigh(xz, yw);
-  HalfRGBA8 lo = zip2Low(xy, zw);
-  HalfRGBA8 hi = zip2High(xy, zw);
+  HalfRGBA8 xyzwl = zipLow(xz, yw);
+  HalfRGBA8 xyzwh = zipHigh(xz, yw);
+  HalfRGBA8 lo = zip2Low(xyzwl, xyzwh);
+  HalfRGBA8 hi = zip2High(xyzwl, xyzwh);
   return combine(lo, hi);
 }
 
@@ -2916,7 +2916,7 @@ static inline WideR8 span_mask(uint8_t*, int span) {
   return span_mask_R8(span);
 }
 
-static inline PackedRG8 span_mask_RG8(int span) {
+UNUSED static inline PackedRG8 span_mask_RG8(int span) {
   return bit_cast<PackedRG8>(I16(span) < I16{1, 2, 3, 4});
 }
 
diff --git a/gfx/wr/swgl/src/vector_type.h b/gfx/wr/swgl/src/vector_type.h
index 2ef42c6896ea..cdb10a7e6724 100644
--- a/gfx/wr/swgl/src/vector_type.h
+++ b/gfx/wr/swgl/src/vector_type.h
@@ -38,6 +38,16 @@ SI VectorType<T, 16> combine(VectorType<T, 8> a, VectorType<T, 8> b) {
                                  13, 14, 15);
 }
 
+template <typename T>
+SI VectorType<T, 2> lowHalf(VectorType<T, 4> a) {
+  return __builtin_shufflevector(a, a, 0, 1);
+}
+
+template <typename T>
+SI VectorType<T, 2> highHalf(VectorType<T, 4> a) {
+  return __builtin_shufflevector(a, a, 2, 3);
+}
+
 template <typename T>
 SI VectorType<T, 4> lowHalf(VectorType<T, 8> a) {
   return __builtin_shufflevector(a, a, 0, 1, 2, 3);
@@ -343,6 +353,12 @@ struct VectorType {
     };
     T elements[2];
   };
+
+  SI VectorType wrap(const data_type& data) {
+    VectorType v;
+    v.data = data;
+    return v;
+  }
 };
 
 # define CONVERT(vector, type) ((type)(vector))
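(Aside for review: partial_store_span's two-pixel store is what motivates the new VectorType<T, 4> -> VectorType<T, 2> lowHalf/highHalf overloads above. Below is a minimal standalone model of those shuffles using Clang vector extensions; the u32x4/u32x2 typedefs and names are illustrative, not swgl's.)

#include <cstdint>

typedef uint32_t u32x4 __attribute__((vector_size(16)));
typedef uint32_t u32x2 __attribute__((vector_size(8)));

// Elements {a0, a1}: an 8-byte value suitable for a single unaligned store,
// e.g. the two-pixel case in partial_store_span.
static inline u32x2 low_half(u32x4 a) {
  return __builtin_shufflevector(a, a, 0, 1);
}

// Elements {a2, a3}.
static inline u32x2 high_half(u32x4 a) {
  return __builtin_shufflevector(a, a, 2, 3);
}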