Bug 509052: Add new, faster blurring code. r=derf

2024-10-08 10:44:56 +00:00 · 2012-11-07 09:29:54 +01:00 · 2012-11-07 09:29:54 +01:00 · cbb586e21a
commit cbb586e21a
parent 923d906d77
7 changed files with 567 additions and 45 deletions
--- a/gfx/2d/Blur.cpp
+++ b/gfx/2d/Blur.cpp
@ -12,6 +12,9 @@
 #include "mozilla/Constants.h"
 #include "mozilla/Util.h"

+#include "2D.h"
+#include "Tools.h"
+
 using namespace std;

 namespace mozilla {
@ -311,8 +314,8 @@ SpreadVertical(unsigned char* aInput,
    }
 }

-static CheckedInt<int32_t>
-RoundUpToMultipleOf4(int32_t aVal)
+CheckedInt<int32_t>
+AlphaBoxBlur::RoundUpToMultipleOf4(int32_t aVal)
 {
  CheckedInt<int32_t> val(aVal);

@ -378,10 +381,9 @@ AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect,
  if (stride.isValid()) {
    mStride = stride.value();

-    CheckedInt<int32_t> size = CheckedInt<int32_t>(mStride) * mRect.height *
-                               sizeof(unsigned char);
+    CheckedInt<int32_t> size = CheckedInt<int32_t>(mStride) * mRect.height;
    if (size.isValid()) {
-      mData = static_cast<unsigned char*>(malloc(size.value()));
+      mData = new uint8_t[size.value()];
      memset(mData, 0, size.value());
    }
  }
@ -405,7 +407,7 @@ AlphaBoxBlur::AlphaBoxBlur(uint8_t* aData,
 AlphaBoxBlur::~AlphaBoxBlur()
 {
  if (mFreeData) {
-    free(mData);
+    delete [] mData;
  }
 }

@ -455,42 +457,236 @@ AlphaBoxBlur::Blur()
  if (mBlurRadius != IntSize(0,0) || mSpreadRadius != IntSize(0,0)) {
    int32_t stride = GetStride();

-    // No need to use CheckedInt here - we have validated it in the constructor.
-    size_t szB = stride * GetSize().height * sizeof(unsigned char);
-    unsigned char* tmpData = static_cast<unsigned char*>(malloc(szB));
-    if (!tmpData)
-      return; // OOM
-
-    memset(tmpData, 0, szB);
+    IntSize size = GetSize();

    if (mSpreadRadius.width > 0 || mSpreadRadius.height > 0) {
+      // No need to use CheckedInt here - we have validated it in the constructor.
+      size_t szB = stride * size.height;
+      unsigned char* tmpData = new uint8_t[szB];
+
+      memset(tmpData, 0, szB);
+
      SpreadHorizontal(mData, tmpData, mSpreadRadius.width, GetSize().width, GetSize().height, stride, mSkipRect);
      SpreadVertical(tmpData, mData, mSpreadRadius.height, GetSize().width, GetSize().height, stride, mSkipRect);
+
+      delete [] tmpData;
    }

-    if (mBlurRadius.width > 0) {
-      int32_t lobes[3][2];
-      ComputeLobes(mBlurRadius.width, lobes);
-      BoxBlurHorizontal(mData, tmpData, lobes[0][0], lobes[0][1], stride, GetSize().height, mSkipRect);
-      BoxBlurHorizontal(tmpData, mData, lobes[1][0], lobes[1][1], stride, GetSize().height, mSkipRect);
-      BoxBlurHorizontal(mData, tmpData, lobes[2][0], lobes[2][1], stride, GetSize().height, mSkipRect);
+    int32_t horizontalLobes[3][2];
+    ComputeLobes(mBlurRadius.width, horizontalLobes);
+    int32_t verticalLobes[3][2];
+    ComputeLobes(mBlurRadius.height, verticalLobes);
+
+    // We want to allow for some extra space on the left for alignment reasons.
+    int32_t maxLeftLobe = RoundUpToMultipleOf4(horizontalLobes[0][0] + 1).value();
+
+    IntSize integralImageSize(size.width + maxLeftLobe + horizontalLobes[1][1],
+                              size.height + verticalLobes[0][0] + verticalLobes[1][1] + 1);
+
+#ifdef IS_BIG_ENDIAN
+    const bool cIsBigEndian = true;
+#else
+    const bool cIsBigEndian = false;
+#endif
+
+    if (cIsBigEndian || (integralImageSize.width * integralImageSize.height) > (1 << 24)) {
+      // Fallback to old blurring code when the surface is so large it may
+      // overflow our integral image!
+
+      // No need to use CheckedInt here - we have validated it in the constructor.
+      size_t szB = stride * size.height;
+      unsigned char* tmpData = new uint8_t[szB];
+
+      memset(tmpData, 0, szB);
+
+      if (mBlurRadius.width > 0) {
+        BoxBlurHorizontal(mData, tmpData, horizontalLobes[0][0], horizontalLobes[0][1], stride, GetSize().height, mSkipRect);
+        BoxBlurHorizontal(tmpData, mData, horizontalLobes[1][0], horizontalLobes[1][1], stride, GetSize().height, mSkipRect);
+        BoxBlurHorizontal(mData, tmpData, horizontalLobes[2][0], horizontalLobes[2][1], stride, GetSize().height, mSkipRect);
+      } else {
+        uint8_t *tmp = mData;
+        mData = tmpData;
+        tmpData = tmp;
+      }
+      if (mBlurRadius.height > 0) {
+        BoxBlurVertical(tmpData, mData, verticalLobes[0][0], verticalLobes[0][1], stride, GetSize().height, mSkipRect);
+        BoxBlurVertical(mData, tmpData, verticalLobes[1][0], verticalLobes[1][1], stride, GetSize().height, mSkipRect);
+        BoxBlurVertical(tmpData, mData, verticalLobes[2][0], verticalLobes[2][1], stride, GetSize().height, mSkipRect);
+      } else {
+        uint8_t *tmp = mData;
+        mData = tmpData;
+        tmpData = tmp;
+      }
+
+      delete [] tmpData;
    } else {
-      memcpy(tmpData, mData, stride * GetSize().height);
-    }
+      size_t integralImageStride = GetAlignedStride<16>(integralImageSize.width * 4);

-    if (mBlurRadius.height > 0) {
-      int32_t lobes[3][2];
-      ComputeLobes(mBlurRadius.height, lobes);
-      BoxBlurVertical(tmpData, mData, lobes[0][0], lobes[0][1], stride, GetSize().height, mSkipRect);
-      BoxBlurVertical(mData, tmpData, lobes[1][0], lobes[1][1], stride, GetSize().height, mSkipRect);
-      BoxBlurVertical(tmpData, mData, lobes[2][0], lobes[2][1], stride, GetSize().height, mSkipRect);
-    } else {
-      memcpy(mData, tmpData, stride * GetSize().height);
-    }
+      AlignedArray<uint32_t> integralImage((integralImageStride / 4) * integralImageSize.height);

-    free(tmpData);
+#ifdef USE_SSE2
+      if (Factory::HasSSE2()) {
+        BoxBlur_SSE2(horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
+                     verticalLobes[0][1], integralImage, integralImageStride);
+        BoxBlur_SSE2(horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
+                     verticalLobes[1][1], integralImage, integralImageStride);
+        BoxBlur_SSE2(horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
+                     verticalLobes[2][1], integralImage, integralImageStride);
+      } else
+#endif
+      {
+        BoxBlur_C(horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
+                  verticalLobes[0][1], integralImage, integralImageStride);
+        BoxBlur_C(horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
+                  verticalLobes[1][1], integralImage, integralImageStride);
+        BoxBlur_C(horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
+                  verticalLobes[2][1], integralImage, integralImageStride);
+      }
+    }
+  }
+}
+
+/**
+ * Computes one row of an integral image (summed-area table).
+ *
+ * The logical input row consists of aLeftInflation copies of the first source
+ * pixel, the aSourceWidth source pixels, and aRightInflation copies of the
+ * last source pixel. Each output entry is the running sum of that padded row
+ * up to and including the entry, plus the entry at the same position in
+ * aPreviousRow.
+ *
+ * @param aDest           Output row. Exactly aLeftInflation + aSourceWidth +
+ *                        aRightInflation entries are written.
+ * @param aSource         Source row; aSourceWidth bytes are read. Must hold
+ *                        at least one pixel (the first and last pixel are
+ *                        read unconditionally).
+ * @param aPreviousRow    Integral row to accumulate on top of. May be the
+ *                        same pointer as aDest: every entry is read before
+ *                        the entry at the same position is written.
+ * @param aSourceWidth    Number of pixels in aSource.
+ * @param aLeftInflation  Number of replicated pixels before the source span.
+ * @param aRightInflation Number of replicated pixels after the source span.
+ */
+MOZ_ALWAYS_INLINE void
+GenerateIntegralRow(uint32_t  *aDest, const uint8_t *aSource, uint32_t *aPreviousRow,
+                    const uint32_t &aSourceWidth, const uint32_t &aLeftInflation, const uint32_t &aRightInflation)
+{
+  uint32_t currentRowSum = 0;
+
+  // Left inflation: replicate the first source pixel.
+  uint32_t pixel = aSource[0];
+  for (uint32_t x = 0; x < aLeftInflation; x++) {
+    currentRowSum += pixel;
+    *aDest++ = currentRowSum + *aPreviousRow++;
+  }
+
+  // Source span, unrolled four pixels at a time. Only complete groups of four
+  // are handled here so that neither the reads from aSource nor the writes to
+  // aDest ever run past aSourceWidth. Individual byte loads are used instead
+  // of a 32-bit load so the result does not depend on alignment or byte order.
+  uint32_t x = 0;
+  for (; x + 4 <= aSourceWidth; x += 4) {
+    const uint8_t *src = aSource + x;
+    currentRowSum += src[0];
+    *aDest++ = *aPreviousRow++ + currentRowSum;
+    currentRowSum += src[1];
+    *aDest++ = *aPreviousRow++ + currentRowSum;
+    currentRowSum += src[2];
+    *aDest++ = *aPreviousRow++ + currentRowSum;
+    currentRowSum += src[3];
+    *aDest++ = *aPreviousRow++ + currentRowSum;
+  }
+
+  // Remaining 0-3 source pixels when aSourceWidth is not a multiple of four.
+  for (; x < aSourceWidth; x++) {
+    currentRowSum += aSource[x];
+    *aDest++ = *aPreviousRow++ + currentRowSum;
+  }
+
+  // Right inflation: replicate the last source pixel.
+  pixel = aSource[aSourceWidth - 1];
+  for (uint32_t i = 0; i < aRightInflation; i++) {
+    currentRowSum += pixel;
+    *aDest++ = currentRowSum + *aPreviousRow++;
+  }
+}
+
+/**
+ * Builds the integral image of aSource, padded on every side by replicating
+ * the border pixels, using plain C code.
+ *
+ * The image has aSize.height + aTopInflation + aBottomInflation rows, each
+ * produced by GenerateIntegralRow():
+ *  - rows 0 .. aTopInflation are generated from the first source row,
+ *  - rows up to aSize.height + aTopInflation - 1 are generated from source
+ *    row (y - aTopInflation),
+ *  - any remaining rows are generated from the last source row.
+ *
+ * @param aLeftInflation       Replicated pixels added left of each row.
+ * @param aRightInflation      Replicated pixels added right of each row.
+ * @param aTopInflation        Replicated rows added above the source.
+ * @param aBottomInflation     Replicated rows added below the source.
+ * @param aIntegralImage       Output buffer.
+ * @param aIntegralImageStride Byte distance between integral image rows.
+ * @param aSource              Source 8-bit surface.
+ * @param aSourceStride        Byte distance between source rows.
+ * @param aSize                Size of the source surface in pixels.
+ */
+MOZ_ALWAYS_INLINE void
+GenerateIntegralImage_C(int32_t aLeftInflation, int32_t aRightInflation,
+                        int32_t aTopInflation, int32_t aBottomInflation,
+                        uint32_t *aIntegralImage, size_t aIntegralImageStride,
+                        uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
+{
+  uint32_t stride32bit = aIntegralImageStride / 4;
+
+  IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
+                            aSize.height + aTopInflation + aBottomInflation);
+
+  // The first row has no row above it. Zero it so that it can be passed as
+  // its own "previous row" below.
+  memset(aIntegralImage, 0, aIntegralImageStride);
+
+  GenerateIntegralRow(aIntegralImage, aSource, aIntegralImage,
+                      aSize.width, aLeftInflation, aRightInflation);
+
+  // Top inflation: keep accumulating the first source row.
+  for (int y = 1; y < aTopInflation + 1; y++) {
+    uint32_t *intRow = aIntegralImage + (y * stride32bit);
+    uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
+
+    GenerateIntegralRow(intRow, aSource, intPrevRow,
+                        aSize.width, aLeftInflation, aRightInflation);
   }

+  // Rows backed by actual source rows.
+  for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
+    GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource + aSourceStride * (y - aTopInflation),
+                        aIntegralImage + (y - 1) * stride32bit, aSize.width, aLeftInflation, aRightInflation);
+  }
+
+  // Bottom inflation: keep accumulating the last source row.
+  if (aBottomInflation) {
+    for (int y = (aSize.height + aTopInflation); y < integralImageSize.height; y++) {
+      GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource + ((aSize.height - 1) * aSourceStride),
+                          aIntegralImage + (y - 1) * stride32bit,
+                          aSize.width, aLeftInflation, aRightInflation);
+    }
+  }
+}
+
+/**
+ * Attempt to do an in-place box blur using an integral image.
+ *
+ * Generates the integral image of mData into aIntegralImage and then
+ * replaces every pixel outside mSkipRect with the average of the box that
+ * extends aLeftLobe/aRightLobe pixels horizontally and aTopLobe/aBottomLobe
+ * pixels vertically around it. The box sum is obtained from four integral
+ * image samples; the division by the box size is done by multiplying with a
+ * 32.32 fixed point reciprocal and keeping the upper 32 bits (truncating).
+ *
+ * @param aLeftLobe            Box extent to the left of the pixel.
+ * @param aRightLobe           Box extent to the right of the pixel.
+ * @param aTopLobe             Box extent above the pixel.
+ * @param aBottomLobe          Box extent below the pixel.
+ * @param aIntegralImage       Scratch buffer that receives the integral image.
+ * @param aIntegralImageStride Byte distance between integral image rows.
+ */
+void
+AlphaBoxBlur::BoxBlur_C(int32_t aLeftLobe,
+                        int32_t aRightLobe,
+                        int32_t aTopLobe,
+                        int32_t aBottomLobe,
+                        uint32_t *aIntegralImage,
+                        size_t aIntegralImageStride)
+{
+  IntSize size = GetSize();
+
+  MOZ_ASSERT(size.width > 0);
+
+  // Our 'left' or 'top' lobe will include the current pixel. i.e. when
+  // looking at an integral image the value of a pixel at 'x,y' is calculated
+  // using the value of the integral image values above/below that.
+  aLeftLobe++;
+  aTopLobe++;
+  int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
+
+  MOZ_ASSERT(boxSize > 0);
+
+  // A 1x1 box leaves every pixel unchanged.
+  if (boxSize == 1) {
+      return;
+  }
+
+  uint32_t stride32bit = aIntegralImageStride / 4;
+
+  int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
+
+  GenerateIntegralImage_C(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
+                          aIntegralImage, aIntegralImageStride, mData,
+                          mStride, size);
+
+  uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
+
+  // Start of the rectangle within the integral image that overlaps the
+  // surface being blurred.
+  uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
+
+  // Row offsets below are negative for the first rows (y < aTopLobe). They
+  // must be computed in a signed, pointer-sized type: multiplying by the
+  // unsigned 32-bit stride directly would wrap to a huge positive offset
+  // before it is added to the pointer.
+  const ptrdiff_t rowStride = ptrdiff_t(stride32bit);
+
+  // Storing these locally makes this about 30% faster! Presumably the compiler
+  // can't be sure we're not altering the member variables in this loop.
+  IntRect skipRect = mSkipRect;
+  uint8_t *data = mData;
+  int32_t stride = mStride;
+  for (int32_t y = 0; y < size.height; y++) {
+    bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();
+
+    uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * rowStride - aLeftLobe);
+    uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * rowStride + aRightLobe);
+    uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * rowStride + aRightLobe);
+    uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * rowStride - aLeftLobe);
+
+    for (int32_t x = 0; x < size.width; x++) {
+      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
+        x = skipRect.XMost() - 1;
+        // Trigger early jump on coming loop iterations, this will be reset
+        // next line anyway.
+        inSkipRectY = false;
+        continue;
+      }
+
+      // Integral image entries may exceed INT32_MAX, so keep all of the
+      // arithmetic unsigned; intermediate wrap-around cancels out.
+      uint32_t topLeft = topLeftBase[x];
+      uint32_t topRight = topRightBase[x];
+      uint32_t bottomRight = bottomRightBase[x];
+      uint32_t bottomLeft = bottomLeftBase[x];
+
+      uint32_t value = bottomRight - topRight - bottomLeft + topLeft;
+
+      data[stride * y + x] = (uint64_t(reciprocal) * value) >> 32;
+    }
+  }
 }

 /**
--- a/gfx/2d/Blur.h
+++ b/gfx/2d/Blur.h
@ -7,6 +7,7 @@

 #include "mozilla/gfx/Rect.h"
 #include "mozilla/gfx/Point.h"
+#include "mozilla/CheckedInt.h"

 namespace mozilla {
 namespace gfx {
@ -114,6 +115,13 @@ public:

 private:

+  void BoxBlur_C(int32_t aLeftLobe, int32_t aRightLobe, int32_t aTopLobe,
+                 int32_t aBottomLobe, uint32_t *aIntegralImage, size_t aIntegralImageStride);
+  void BoxBlur_SSE2(int32_t aLeftLobe, int32_t aRightLobe, int32_t aTopLobe,
+                    int32_t aBottomLobe, uint32_t *aIntegralImage, size_t aIntegralImageStride);
+
+  static CheckedInt<int32_t> RoundUpToMultipleOf4(int32_t aVal);
+
  /**
   * A rect indicating the area where blurring is unnecessary, and the blur
   * algorithm should skip over it.
--- a/gfx/2d/BlurSSE2.cpp
+++ b/gfx/2d/BlurSSE2.cpp
@ -0,0 +1,250 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "Blur.h"
+
+#include "SSEHelpers.h"
+
+#include <string.h>
+
+namespace mozilla {
+namespace gfx {
+
+/**
+ * Scales four unsigned 32-bit values by a 32.32 fixed point factor and packs
+ * the results into four bytes.
+ *
+ * For every 32-bit lane of aValues the 64-bit product with aDivisor is formed
+ * and its upper 32 bits are kept. The four results are then narrowed with
+ * saturation to 8 bits each and returned in a single uint32_t, lane 0 in the
+ * lowest byte.
+ *
+ * @param aValues  Four 32-bit values.
+ * @param aDivisor The multiplier, replicated in the lanes that
+ *                 _mm_mul_epu32 reads (the even 32-bit lanes).
+ * @param aMask    Mask selecting the upper 32 bits of each 64-bit lane; the
+ *                 visible caller passes { 0, ~0, 0, ~0 }.
+ */
+MOZ_ALWAYS_INLINE
+uint32_t DivideAndPack(__m128i aValues, __m128i aDivisor, __m128i aMask)
+{
+  // _mm_mul_epu32 only multiplies lanes 0 and 2. Shifting each 64-bit product
+  // right by 32 leaves its high half in lanes 0 and 2.
+  __m128i multiplied = _mm_srli_epi64(_mm_mul_epu32(aValues, aDivisor), 32); // 00p300p1
+  // Move lanes 1 and 3 down into lanes 0 and 2, multiply those, and keep the
+  // high half of each product in place (lanes 1 and 3) via the mask.
+  multiplied = _mm_or_si128(multiplied, _mm_and_si128(_mm_mul_epu32(_mm_srli_epi64(aValues, 32), aDivisor),
+    aMask)); // p4p3p2p1
+  // Narrow 32 -> 16 -> 8 bits with saturation; the four bytes end up in the
+  // low 32 bits of the register.
+  __m128i final = _mm_packus_epi16(_mm_packs_epi32(multiplied, _mm_setzero_si128()), _mm_setzero_si128());
+
+  return _mm_cvtsi128_si32(final);
+}
+
+/**
+ * Writes the running (prefix) sums of one padded source row into aDest.
+ *
+ * The padded row is aLeftInflation copies of the first source pixel, then
+ * the aSourceWidth source pixels, then aRightInflation copies of the last
+ * source pixel. No previous row is added; the output only contains the
+ * horizontal sums.
+ */
+MOZ_ALWAYS_INLINE
+void LoadIntegralRowFromRow(uint32_t *aDest, const uint8_t *aSource,
+                            int32_t aSourceWidth, int32_t aLeftInflation,
+                            int32_t aRightInflation)
+{
+  int32_t runningSum = 0;
+
+  // Left padding: first source pixel repeated.
+  uint32_t *leftOut = aDest;
+  for (int i = 0; i < aLeftInflation; i++) {
+    runningSum += aSource[0];
+    leftOut[i] = runningSum;
+  }
+
+  // The source pixels themselves.
+  uint32_t *sourceOut = aDest + aLeftInflation;
+  for (int i = 0; i < aSourceWidth; i++) {
+    runningSum += aSource[i];
+    sourceOut[i] = runningSum;
+  }
+
+  // Right padding: last source pixel repeated.
+  uint32_t *rightOut = sourceOut + aSourceWidth;
+  for (int i = 0; i < aRightInflation; i++) {
+    runningSum += aSource[aSourceWidth - 1];
+    rightOut[i] = runningSum;
+  }
+}
+
+// Returns the inclusive prefix sums of the four 32-bit lanes of aPixels,
+// e.g. { 30, 50, 80, 100 } becomes { 30, 80, 160, 260 }. According to the
+// original author this was the fastest variant after much testing.
+MOZ_ALWAYS_INLINE
+__m128i AccumulatePixelSums(__m128i aPixels)
+{
+  // { a, b, c, d } + { 0, a, b, c } = { a, a+b, b+c, c+d }
+  const __m128i pairSums = _mm_add_epi32(aPixels, _mm_slli_si128(aPixels, 4));
+
+  // Move the two low pair sums into the two high lanes: { 0, 0, a, a+b }.
+  const __m128i carriedPairs = _mm_unpacklo_epi64(_mm_setzero_si128(), pairSums);
+
+  // { a, a+b, a+b+c, a+b+c+d }
+  return _mm_add_epi32(pairSums, carriedPairs);
+}
+
+/**
+ * Builds the integral image of aSource, padded on every side by replicating
+ * the border pixels, using SSE2.
+ *
+ * Layout of the generated rows:
+ *  - row 0 holds the horizontal running sums of the first source row,
+ *  - rows 1 .. aTopInflation add row 0 to the row above (first source row
+ *    replicated),
+ *  - rows up to aSize.height + aTopInflation - 1 are generated from source
+ *    row (y - aTopInflation),
+ *  - the remaining rows replicate the last source row.
+ *
+ * aLeftInflation must be a multiple of 4 (asserted). Rows are read and
+ * written with aligned 128-bit loads/stores, so this assumes aIntegralImage
+ * and aIntegralImageStride are 16-byte aligned - NOTE(review): confirm at
+ * the call site.
+ *
+ * @param aLeftInflation       Replicated pixels added left of each row.
+ * @param aRightInflation      Replicated pixels added right of each row.
+ * @param aTopInflation        Replicated rows added above the source.
+ * @param aBottomInflation     Replicated rows added below the source.
+ * @param aIntegralImage       Output buffer.
+ * @param aIntegralImageStride Byte distance between integral image rows.
+ * @param aSource              Source 8-bit surface.
+ * @param aSourceStride        Byte distance between source rows.
+ * @param aSize                Size of the source surface in pixels.
+ */
+MOZ_ALWAYS_INLINE void
+GenerateIntegralImage_SSE2(int32_t aLeftInflation, int32_t aRightInflation,
+                           int32_t aTopInflation, int32_t aBottomInflation,
+                           uint32_t *aIntegralImage, size_t aIntegralImageStride,
+                           uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
+{
+  MOZ_ASSERT(!(aLeftInflation & 3));
+
+  uint32_t stride32bit = aIntegralImageStride / 4;
+
+  IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
+                            aSize.height + aTopInflation + aBottomInflation);
+
+  // Row 0: horizontal running sums of the first source row.
+  LoadIntegralRowFromRow(aIntegralImage, aSource, aSize.width, aLeftInflation, aRightInflation);
+
+  // Top inflation: each row is the row above plus row 0, four entries at a
+  // time.
+  for (int y = 1; y < aTopInflation + 1; y++) {
+    uint32_t *intRow = aIntegralImage + (y * stride32bit);
+    uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
+    uint32_t *intFirstRow = aIntegralImage;
+
+    for (int x = 0; x < integralImageSize.width; x += 4) {
+      __m128i firstRow = _mm_load_si128((__m128i*)(intFirstRow + x));
+      __m128i previousRow = _mm_load_si128((__m128i*)(intPrevRow + x));
+      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(firstRow, previousRow));
+    }
+  }
+
+  // Rows backed by actual source rows.
+  for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
+    __m128i currentRowSum = _mm_setzero_si128();
+    uint32_t *intRow = aIntegralImage + (y * stride32bit);
+    uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
+    uint8_t *sourceRow = aSource + aSourceStride * (y - aTopInflation);
+
+    // Left inflation: the first pixel of the row, replicated, four entries
+    // per iteration. currentRowSum carries the last running sum (broadcast
+    // to all lanes) into the next group.
+    uint32_t pixel = sourceRow[0];
+    for (int x = 0; x < aLeftInflation; x += 4) {
+      __m128i sumPixels = AccumulatePixelSums(_mm_shuffle_epi32(_mm_set1_epi32(pixel), _MM_SHUFFLE(0, 0, 0, 0)));
+
+      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
+
+      currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));
+
+      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
+    }
+    // Source span: four source bytes are fetched with one 32-bit read and
+    // widened to four 32-bit lanes. When aSize.width is not a multiple of 4
+    // the last iteration also consumes bytes past aSize.width; the entries
+    // it produces are rewritten by the unaligned handling below.
+    for (int x = aLeftInflation; x < (aSize.width + aLeftInflation); x += 4) {
+      uint32_t pixels = *(uint32_t*)(sourceRow + (x - aLeftInflation));
+
+      // It's important to shuffle here. When we exit this loop currentRowSum
+      // has to be set to sumPixels, so that the following loop can get the
+      // correct pixel for the currentRowSum. The highest order pixel in
+      // currentRowSum could've originated from accumulation in the stride.
+      currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
+
+      __m128i sumPixels = AccumulatePixelSums(_mm_unpacklo_epi16(_mm_unpacklo_epi8( _mm_set1_epi32(pixels), _mm_setzero_si128()), _mm_setzero_si128()));
+      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
+
+      currentRowSum = sumPixels;
+
+      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
+    }
+
+    // Right inflation: the last pixel of the row, replicated.
+    pixel = sourceRow[aSize.width - 1];
+    int x = (aSize.width + aLeftInflation);
+    if ((aSize.width & 3)) {
+      // Deal with unaligned portion. Get the correct pixel from currentRowSum,
+      // see explanation above.
+      uint32_t intCurrentRowSum = ((uint32_t*)&currentRowSum)[(aSize.width % 4) - 1];
+      // Scalar loop until x reaches the next multiple of 4 (or the end of the
+      // row), then hand over to the vector loop below.
+      for (; x < integralImageSize.width; x++) {
+        // We could be unaligned here!
+        if (!(x & 3)) {
+          // aligned!
+          currentRowSum = _mm_set1_epi32(intCurrentRowSum);
+          break;
+        }
+        intCurrentRowSum += pixel;
+        intRow[x] = intPrevRow[x] + intCurrentRowSum;
+      }
+    } else {
+      // Width is a multiple of 4: the last lane holds the running sum so far.
+      currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
+    }
+    for (; x < integralImageSize.width; x += 4) {
+      __m128i sumPixels = AccumulatePixelSums(_mm_set1_epi32(pixel));
+
+      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
+
+      currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));
+
+      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
+    }
+  }
+
+  if (aBottomInflation) {
+    // Store the last valid row of our source image in the last row of
+    // our integral image. This will be overwritten with the correct values
+    // in the upcoming loop.
+    LoadIntegralRowFromRow(aIntegralImage + (integralImageSize.height - 1) * stride32bit,
+                           aSource + (aSize.height - 1) * aSourceStride, aSize.width, aLeftInflation, aRightInflation);
+
+
+    // Bottom inflation: each row is the row above plus the running sums of
+    // the last source row. On the final iteration intRow and intLastRow are
+    // the same row, which is updated in place.
+    for (int y = aSize.height + aTopInflation; y < integralImageSize.height; y++) {
+      __m128i *intRow = (__m128i*)(aIntegralImage + (y * stride32bit));
+      __m128i *intPrevRow = (__m128i*)(aIntegralImage + (y - 1) * stride32bit);
+      __m128i *intLastRow = (__m128i*)(aIntegralImage + (integralImageSize.height - 1) * stride32bit);
+
+      for (int x = 0; x < integralImageSize.width; x += 4) {
+        _mm_store_si128(intRow + (x / 4),
+                        _mm_add_epi32(_mm_load_si128(intLastRow + (x / 4)),
+                                      _mm_load_si128(intPrevRow + (x / 4))));
+      }
+    }
+  }
+}
+
+/**
+ * Attempt to do an in-place box blur using an integral image.
+ *
+ * SSE2 counterpart of the C implementation: generates the integral image of
+ * mData into aIntegralImage, then replaces four pixels at a time with the
+ * average of the surrounding box, skipping over mSkipRect. The division by
+ * the box size is a multiplication with a 32.32 fixed point reciprocal of
+ * which the upper 32 bits are kept.
+ *
+ * @param aLeftLobe            Box extent to the left of the pixel.
+ * @param aRightLobe           Box extent to the right of the pixel.
+ * @param aTopLobe             Box extent above the pixel.
+ * @param aBottomLobe          Box extent below the pixel.
+ * @param aIntegralImage       Scratch buffer that receives the integral image.
+ * @param aIntegralImageStride Byte distance between integral image rows.
+ */
+void
+AlphaBoxBlur::BoxBlur_SSE2(int32_t aLeftLobe,
+                           int32_t aRightLobe,
+                           int32_t aTopLobe,
+                           int32_t aBottomLobe,
+                           uint32_t *aIntegralImage,
+                           size_t aIntegralImageStride)
+{
+  IntSize size = GetSize();
+
+  MOZ_ASSERT(size.height > 0);
+
+  // Our 'left' or 'top' lobe will include the current pixel. i.e. when
+  // looking at an integral image the value of a pixel at 'x,y' is calculated
+  // using the value of the integral image values above/below that.
+  aLeftLobe++;
+  aTopLobe++;
+  int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
+
+  MOZ_ASSERT(boxSize > 0);
+
+  // A 1x1 box leaves every pixel unchanged.
+  if (boxSize == 1) {
+      return;
+  }
+
+  uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
+
+  uint32_t stride32bit = aIntegralImageStride / 4;
+  int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
+
+  GenerateIntegralImage_SSE2(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
+                             aIntegralImage, aIntegralImageStride, mData,
+                             mStride, size);
+
+  __m128i divisor = _mm_set1_epi32(reciprocal);
+  // Selects the upper 32 bits of each 64-bit lane, as DivideAndPack expects.
+  __m128i mask = _mm_setr_epi32(0x0, 0xffffffff, 0x0, 0xffffffff);
+
+  // This points to the start of the rectangle within the IntegralImage that overlaps
+  // the surface being blurred.
+  uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
+
+  // Local copies of the members used inside the loops.
+  IntRect skipRect = mSkipRect;
+  int32_t stride = mStride;
+  uint8_t *data = mData;
+  for (int32_t y = 0; y < size.height; y++) {
+    bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();
+
+    // The row offsets are negative for the first rows, hence the cast of the
+    // unsigned stride to ptrdiff_t before multiplying.
+    uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
+    uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) + aRightLobe);
+    uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) + aRightLobe);
+    uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
+
+    for (int32_t x = 0; x < size.width; x += 4) {
+      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
+        // The loop increment adds 4, so the next iteration starts at XMost().
+        x = skipRect.XMost() - 4;
+        // Trigger early jump on coming loop iterations, this will be reset
+        // next line anyway.
+        inSkipRectY = false;
+        continue;
+      }
+      __m128i topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
+      __m128i topRight = loadUnaligned128((__m128i*)(topRightBase + x));
+      __m128i bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
+      __m128i bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));
+
+      // Box sum for four neighbouring pixels:
+      // bottomRight - topRight - bottomLeft + topLeft.
+      __m128i values = _mm_add_epi32(_mm_sub_epi32(_mm_sub_epi32(bottomRight, topRight), bottomLeft), topLeft);
+
+      // NOTE(review): this always writes 4 bytes. When size.width is not a
+      // multiple of 4, or after a skip-rect jump leaves x not a multiple of
+      // 4, the write extends past size.width - confirm that the stride
+      // padding / buffer size always covers it.
+      *(uint32_t*)(data + stride * y + x) = DivideAndPack(values, divisor, mask);
+    }
+  }
+
+}
+
+}
+}
--- a/gfx/2d/ImageScalingSSE2.cpp
+++ b/gfx/2d/ImageScalingSSE2.cpp
@ -6,8 +6,7 @@
 #include "ImageScaling.h"
 #include "mozilla/Attributes.h"

-#include <xmmintrin.h>
-#include <emmintrin.h>
+#include "SSEHelpers.h"

 /* The functions below use the following system for averaging 4 pixels:
 *
@ -108,17 +107,6 @@ MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
 }

-/* Before Nehalem _mm_loadu_si128 could be very slow, this trick is a little
- * faster. Once enough people are on architectures where _mm_loadu_si128 is
- * fast we can migrate to it.
- */
-MOZ_ALWAYS_INLINE __m128i loadUnaligned128(const __m128i *aSource)
-{
-  // Yes! We use uninitialized memory here, we'll overwrite it though!
-  __m128 res = _mm_loadl_pi(_mm_set1_ps(0), (const __m64*)aSource);
-  return _mm_castps_si128(_mm_loadh_pi(res, ((const __m64*)(aSource)) + 1));
-}
-
 MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
 {
  uint32_t sum = a ^ b ^ c;
--- a/gfx/2d/Makefile.in
+++ b/gfx/2d/Makefile.in
@ -116,7 +116,10 @@ endif
 ifneq (,$(INTEL_ARCHITECTURE))
 # VC2005 doesn't support _mm_castsi128_ps, so SSE2 is turned off
 ifneq (1400,$(_MSC_VER))
-CPPSRCS += ImageScalingSSE2.cpp
+CPPSRCS += \
+        ImageScalingSSE2.cpp \
+        BlurSSE2.cpp \
+        $(NULL)
 DEFINES += -DUSE_SSE2
 endif
 endif
@ -161,10 +164,12 @@ DEFINES := $(filter-out -DUNICODE -D_UNICODE,$(DEFINES))
 ifneq (,$(INTEL_ARCHITECTURE))
 ifdef GNU_CC
 ImageScalingSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
+BlurSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
 endif

 ifdef SOLARIS_SUNPRO_CXX
 ImageScalingSSE2.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
+BlurSSE2.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
 endif
 endif

--- a/gfx/2d/SSEHelpers.h
+++ b/gfx/2d/SSEHelpers.h
@ -0,0 +1,17 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef MOZILLA_GFX_SSEHELPERS_H_
+#define MOZILLA_GFX_SSEHELPERS_H_
+
+// For MOZ_ALWAYS_INLINE, so this header does not rely on its includers.
+#include "mozilla/Attributes.h"
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+/* Loads 128 bits from a possibly unaligned address as two 64-bit halves.
+ *
+ * Before Nehalem _mm_loadu_si128 could be very slow, this trick is a little
+ * faster. Once enough people are on architectures where _mm_loadu_si128 is
+ * fast we can migrate to it.
+ */
+MOZ_ALWAYS_INLINE __m128i loadUnaligned128(const __m128i *aSource)
+{
+  // The zero vector is only a placeholder: its low half is replaced by the
+  // first load and its high half by the second one.
+  __m128 res = _mm_loadl_pi(_mm_set1_ps(0), (const __m64*)aSource);
+  return _mm_castps_si128(_mm_loadh_pi(res, ((const __m64*)(aSource)) + 1));
+}
+
+#endif /* MOZILLA_GFX_SSEHELPERS_H_ */
--- a/gfx/2d/Tools.h
+++ b/gfx/2d/Tools.h
@ -81,6 +81,64 @@ BytesPerPixel(SurfaceFormat aFormat)
  }
 }

+/**
+ * Owns a heap array of T and exposes a pointer into it that is aligned to
+ * |alignment| bytes.
+ *
+ * The backing allocation is made with new T[], so the elements are
+ * default-initialized (uninitialized for built-in types). mStorage is the
+ * pointer that was allocated and is the one that gets freed; mPtr is the
+ * aligned pointer handed out to users.
+ *
+ * NOTE(review): the implicitly generated copy constructor and assignment
+ * operator copy the raw pointers, so copying an AlignedArray would free the
+ * storage twice. Do not copy instances.
+ */
+template<typename T, int alignment = 16>
+struct AlignedArray
+{
+  AlignedArray()
+    : mStorage(nullptr)
+    , mPtr(nullptr)
+  {
+  }
+
+  /** Allocates room for aSize aligned elements. */
+  MOZ_ALWAYS_INLINE AlignedArray(size_t aSize)
+    : mStorage(nullptr)
+    , mPtr(nullptr)
+  {
+    Realloc(aSize);
+  }
+
+  MOZ_ALWAYS_INLINE ~AlignedArray()
+  {
+    delete [] mStorage;
+  }
+
+  /** Frees the storage and resets both pointers to null. */
+  void Dealloc()
+  {
+    delete [] mStorage;
+    mStorage = mPtr = nullptr;
+  }
+
+  /**
+   * Discards the current contents and allocates room for aSize aligned
+   * elements. Existing elements are not preserved.
+   */
+  MOZ_ALWAYS_INLINE void Realloc(size_t aSize)
+  {
+    // Reset the pointers first so they never refer to freed memory, even if
+    // the allocation below does not complete.
+    Dealloc();
+
+    // Over-allocate so that an aligned start address is always available
+    // inside the allocation.
+    mStorage = new T[aSize + (alignment - 1)];
+
+    const uintptr_t misalignment = uintptr_t(mStorage) % alignment;
+    if (misalignment) {
+      // Our storage does not start at a <alignment>-byte boundary. Make sure
+      // mPtr does!
+      mPtr = reinterpret_cast<T*>(uintptr_t(mStorage) + (alignment - misalignment));
+    } else {
+      mPtr = mStorage;
+    }
+  }
+
+  /** Implicit conversion to the aligned element pointer. */
+  MOZ_ALWAYS_INLINE operator T*()
+  {
+    return mPtr;
+  }
+
+  T *mStorage; // Allocation as returned by new[]; this is what gets deleted.
+  T *mPtr;     // Aligned pointer inside mStorage.
+};
+
+/**
+ * Rounds aStride up to the next multiple of |alignment|. A stride that is
+ * already a multiple of |alignment| is returned unchanged.
+ */
+template<int alignment>
+int32_t GetAlignedStride(int32_t aStride)
+{
+  const int32_t remainder = aStride % alignment;
+
+  return remainder ? aStride + (alignment - remainder) : aStride;
+}
+
 }
 }