diff --git a/gfx/2d/convolver.cpp b/gfx/2d/convolver.cpp
index b02552d052b1..b4a23133f36f 100644
--- a/gfx/2d/convolver.cpp
+++ b/gfx/2d/convolver.cpp
@@ -175,11 +175,11 @@ class CircularRowBuffer {
 // |src_data| and continues for the [begin, end) of the filter.
 template<bool has_alpha>
 void ConvolveHorizontally(const unsigned char* src_data,
-                          int begin, int end,
                           const ConvolutionFilter1D& filter,
                           unsigned char* out_row) {
+  int num_values = filter.num_values();
   // Loop over each pixel on this row in the output image.
-  for (int out_x = begin; out_x < end; out_x++) {
+  for (int out_x = 0; out_x < num_values; out_x++) {
     // Get the filter that determines the current output pixel.
     int filter_offset, filter_length;
     const ConvolutionFilter1D::Fixed* filter_values =
@@ -220,17 +220,18 @@ void ConvolveHorizontally(const unsigned char* src_data,
 // Does vertical convolution to produce one output row. The filter values and
 // length are given in the first two parameters. These are applied to each
 // of the rows pointed to in the |source_data_rows| array, with each row
-// being |end - begin| wide.
+// being |pixel_width| wide.
 //
-// The output must have room for |(end - begin) * 4| bytes.
+// The output must have room for |pixel_width * 4| bytes.
 template<bool has_alpha>
 void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
                         int filter_length,
                         unsigned char* const* source_data_rows,
-                        int begin, int end, unsigned char* out_row) {
+                        int pixel_width,
+                        unsigned char* out_row) {
   // We go through each column in the output and do a vertical convolution,
   // generating one output pixel each time.
-  for (int out_x = begin; out_x < end; out_x++) {
+  for (int out_x = 0; out_x < pixel_width; out_x++) {
     // Compute the number of bytes over in each row that the current column
     // we're convolving starts at. The pixel will cover the next 4 bytes.
     int byte_offset = out_x * 4;
@@ -288,28 +289,29 @@ void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
 void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
                         int filter_length,
                         unsigned char* const* source_data_rows,
-                        int width, unsigned char* out_row,
+                        int pixel_width, unsigned char* out_row,
                         bool has_alpha, bool use_simd) {
-  int processed = 0;
 #if defined(USE_SSE2) || defined(_MIPS_ARCH_LOONGSON3A)
   // If the binary was not built with SSE2 support, we had to fallback to C version.
-  int simd_width = width & ~3;
-  if (use_simd && simd_width) {
+  if (use_simd) {
     ConvolveVertically_SIMD(filter_values, filter_length,
-                            source_data_rows, 0, simd_width,
+                            source_data_rows,
+                            pixel_width,
                             out_row, has_alpha);
-    processed = simd_width;
-  }
+  } else
 #endif
-
-  if (width > processed) {
+  {
     if (has_alpha) {
-      ConvolveVertically<true>(filter_values, filter_length, source_data_rows,
-                               processed, width, out_row);
+      ConvolveVertically<true>(filter_values, filter_length,
+                               source_data_rows,
+                               pixel_width,
+                               out_row);
     } else {
-      ConvolveVertically<false>(filter_values, filter_length, source_data_rows,
-                                processed, width, out_row);
+      ConvolveVertically<false>(filter_values, filter_length,
+                                source_data_rows,
+                                pixel_width,
+                                out_row);
     }
   }
 }
@@ -326,16 +328,16 @@ void ConvolveHorizontally(const unsigned char* src_data,
     // SIMD implementation works with 4 pixels at a time.
     // Therefore we process as much as we can using SSE and then use
     // C implementation for leftovers
-    ConvolveHorizontally_SSE2(src_data, 0, simd_width, filter, out_row);
+    ConvolveHorizontally_SSE2(src_data, filter, out_row);
     processed = simd_width;
   }
 #endif
 
   if (width > processed) {
     if (has_alpha) {
-      ConvolveHorizontally<true>(src_data, processed, width, filter, out_row);
+      ConvolveHorizontally<true>(src_data, filter, out_row);
     } else {
-      ConvolveHorizontally<false>(src_data, processed, width, filter, out_row);
+      ConvolveHorizontally<false>(src_data, filter, out_row);
     }
   }
 }
@@ -457,9 +459,23 @@ void BGRAConvolve2D(const unsigned char* source_data,
   int num_output_rows = filter_y.num_values();
 
   int pixel_width = filter_x.num_values();
+
   // We need to check which is the last line to convolve before we advance 4
   // lines in one iteration.
   int last_filter_offset, last_filter_length;
+  // SSE2 can access up to 3 extra pixels past the end of the
+  // buffer. At the bottom of the image, we have to be careful
+  // not to access data past the end of the buffer. Normally
+  // we fall back to the C++ implementation for the last row.
+  // If the last row is less than 3 pixels wide, we may have to fall
+  // back to the C++ version for more rows. Compute how many
+  // rows we need to avoid the SSE implementation for here.
+  filter_x.FilterForValue(filter_x.num_values() - 1, &last_filter_offset,
+                          &last_filter_length);
+#if defined(USE_SSE2) || defined(_MIPS_ARCH_LOONGSON3A)
+  int avoid_simd_rows = 1 + 3 /
+      (last_filter_offset + last_filter_length);
+#endif
   filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
                           &last_filter_length);
@@ -473,36 +489,32 @@ void BGRAConvolve2D(const unsigned char* source_data,
     // We don't want to process too much rows in batches of 4 because
     // we can go out-of-bounds at the end
     while (next_x_row < filter_offset + filter_length) {
-      if (next_x_row + 3 < last_filter_offset + last_filter_length - 3) {
+      if (next_x_row + 3 < last_filter_offset + last_filter_length -
+          avoid_simd_rows) {
         const unsigned char* src[4];
         unsigned char* out_row[4];
         for (int i = 0; i < 4; ++i) {
           src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
           out_row[i] = row_buffer.AdvanceRow();
         }
-        ConvolveHorizontally4_SIMD(src, 0, pixel_width, filter_x, out_row);
+        ConvolveHorizontally4_SIMD(src, filter_x, out_row);
         next_x_row += 4;
       } else {
-        unsigned char* buffer = row_buffer.AdvanceRow();
-
-        // For last rows, SSE2 load possibly to access data beyond the
-        // image area. therefore we use cobined C+SSE version here
-        int simd_width = pixel_width & ~3;
-        if (simd_width) {
+        // Check if we need to avoid SSE2 for this row.
+        if (next_x_row < last_filter_offset + last_filter_length -
+            avoid_simd_rows) {
           ConvolveHorizontally_SIMD(
               &source_data[next_x_row * source_byte_row_stride],
-              0, simd_width, filter_x, buffer);
-        }
-
-        if (pixel_width > simd_width) {
+              filter_x, row_buffer.AdvanceRow());
+        } else {
           if (source_has_alpha) {
             ConvolveHorizontally<true>(
                 &source_data[next_x_row * source_byte_row_stride],
-                simd_width, pixel_width, filter_x, buffer);
+                filter_x, row_buffer.AdvanceRow());
           } else {
             ConvolveHorizontally<false>(
                 &source_data[next_x_row * source_byte_row_stride],
-                simd_width, pixel_width, filter_x, buffer);
+                filter_x, row_buffer.AdvanceRow());
           }
         }
         next_x_row++;
@@ -513,12 +525,12 @@
     while (next_x_row < filter_offset + filter_length) {
       if (source_has_alpha) {
         ConvolveHorizontally<true>(
-            &source_data[next_x_row * source_byte_row_stride],
-            0, pixel_width, filter_x, row_buffer.AdvanceRow());
+            &source_data[next_x_row * source_byte_row_stride],
+            filter_x, row_buffer.AdvanceRow());
       } else {
         ConvolveHorizontally<false>(
-            &source_data[next_x_row * source_byte_row_stride],
-            0, pixel_width, filter_x, row_buffer.AdvanceRow());
+            &source_data[next_x_row * source_byte_row_stride],
+            filter_x, row_buffer.AdvanceRow());
       }
       next_x_row++;
     }
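Note (illustrative, not part of the patch): a minimal sketch of the arithmetic behind the new avoid_simd_rows bound, under the reading that the SSE2 horizontal pass may read up to 3 pixels (12 bytes) past the last filter tap of a source row. Because source rows sit back to back in one buffer, that over-read normally just spills into the following row; only the bottom rows of the image have to take the C++ path. Here "covered" stands for last_filter_offset + last_filter_length, which the patch comment treats as the usable width of a row.

#include <cstdio>

// Mirrors the avoid_simd_rows expression added to BGRAConvolve2D (sketch only).
static int AvoidSimdRows(int covered) {
  // One row for the over-read out of the last row itself, plus however many
  // whole extra rows a 3-pixel spill can cross when rows are very narrow.
  return 1 + 3 / covered;
}

int main() {
  // covered = 1: the spill crosses 3 more rows, so 4 rows fall back to C++.
  // covered = 3: one extra row, so 2 rows fall back.
  // covered >= 4: only the last row needs the C++ path.
  std::printf("%d %d %d\n", AvoidSimdRows(1), AvoidSimdRows(3), AvoidSimdRows(8));  // 4 2 1
  return 0;
}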
diff --git a/gfx/2d/convolverSSE2.cpp b/gfx/2d/convolverSSE2.cpp
index a1b223703eee..20ddd45f1fbe 100644
--- a/gfx/2d/convolverSSE2.cpp
+++ b/gfx/2d/convolverSSE2.cpp
@@ -35,26 +35,24 @@ namespace skia {
 
 // Convolves horizontally along a single row. The row data is given in
-// |src_data| and continues for the [begin, end) of the filter.
+// |src_data| and continues for the num_values() of the filter.
 void ConvolveHorizontally_SSE2(const unsigned char* src_data,
-                               int begin, int end,
                                const ConvolutionFilter1D& filter,
                                unsigned char* out_row) {
+  int num_values = filter.num_values();
   int filter_offset, filter_length;
   __m128i zero = _mm_setzero_si128();
-  __m128i mask[3];
+  __m128i mask[4];
   // |mask| will be used to decimate all extra filter coefficients that are
   // loaded by SIMD when |filter_length| is not divisible by 4.
-  mask[0] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
-  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
-  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
-
-  // This buffer is used for tails
-  __m128i buffer;
+  // mask[0] is not used in following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
 
   // Output one pixel each iteration, calculating all channels (RGBA) together.
-  for (int out_x = begin; out_x < end; out_x++) {
+  for (int out_x = 0; out_x < num_values; out_x++) {
     const ConvolutionFilter1D::Fixed* filter_values =
         filter.FilterForValue(out_x, &filter_offset, &filter_length);
@@ -117,22 +115,21 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,
 
     // When |filter_length| is not divisible by 4, we need to decimate some of
     // the filter coefficient that was loaded incorrectly to zero; Other than
-    // that the algorithm is same with above, except that the 4th pixel will be
+    // that the algorithm is same with above, exceot that the 4th pixel will be
     // always absent.
-    int r = filter_length & 3;
+    int r = filter_length&3;
     if (r) {
-      memcpy(&buffer, row_to_filter, r * 4);
       // Note: filter_values must be padded to align_up(filter_offset, 8).
       __m128i coeff, coeff16;
       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
       // Mask out extra filter taps.
-      coeff = _mm_and_si128(coeff, mask[r-1]);
+      coeff = _mm_and_si128(coeff, mask[r]);
       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
 
       // Note: line buffer must be padded to align_up(filter_offset, 16).
-      // We resolve this by temporary buffer
-      __m128i src8 = _mm_loadu_si128(&buffer);
+      // We resolve this by use C-version for the last horizontal line.
+      __m128i src8 = _mm_loadu_si128(row_to_filter);
       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
@@ -165,24 +162,26 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,
 }
 
 // Convolves horizontally along four rows. The row data is given in
-// |src_data| and continues for the [begin, end) of the filter.
+// |src_data| and continues for the num_values() of the filter.
 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
 // refer to that function for detailed comments.
 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
-                                int begin, int end,
                                 const ConvolutionFilter1D& filter,
                                 unsigned char* out_row[4]) {
+  int num_values = filter.num_values();
+
   int filter_offset, filter_length;
   __m128i zero = _mm_setzero_si128();
-  __m128i mask[3];
+  __m128i mask[4];
   // |mask| will be used to decimate all extra filter coefficients that are
   // loaded by SIMD when |filter_length| is not divisible by 4.
-  mask[0] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
-  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
-  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+  // mask[0] is not used in following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
 
   // Output one pixel each iteration, calculating all channels (RGBA) together.
-  for (int out_x = begin; out_x < end; out_x++) {
+  for (int out_x = 0; out_x < num_values; out_x++) {
     const ConvolutionFilter1D::Fixed* filter_values =
         filter.FilterForValue(out_x, &filter_offset, &filter_length);
 
@@ -240,7 +239,7 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
       __m128i coeff;
       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
       // Mask out extra filter taps.
-      coeff = _mm_and_si128(coeff, mask[r-1]);
+      coeff = _mm_and_si128(coeff, mask[r]);
       __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
       /* c1 c1 c1 c1 c0 c0 c0 c0 */
@@ -284,21 +283,22 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
 
 // Does vertical convolution to produce one output row. The filter values and
 // length are given in the first two parameters. These are applied to each
 // of the rows pointed to in the |source_data_rows| array, with each row
-// being |end - begin| wide.
+// being |pixel_width| wide.
 //
-// The output must have room for |(end - begin) * 4| bytes.
+// The output must have room for |pixel_width * 4| bytes.
 template<bool has_alpha>
 void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_values,
                                   int filter_length,
                                   unsigned char* const* source_data_rows,
-                                  int begin, int end,
+                                  int pixel_width,
                                   unsigned char* out_row) {
+  int width = pixel_width & ~3;
+
   __m128i zero = _mm_setzero_si128();
   __m128i accum0, accum1, accum2, accum3, coeff16;
   const __m128i* src;
-  int out_x;
   // Output four pixels per iteration (16 bytes).
-  for (out_x = begin; out_x + 3 < end; out_x += 4) {
+  for (int out_x = 0; out_x < width; out_x += 4) {
     // Accumulated result for each pixel. 32 bits per RGBA channel.
     accum0 = _mm_setzero_si128();
@@ -391,11 +391,7 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
 
   // When the width of the output is not divisible by 4, We need to save one
   // pixel (4 bytes) each time. And also the fourth pixel is always absent.
-  int r = end - out_x;
-  if (r > 0) {
-    // Since accum3 is never used here, we'll use it as a buffer
-    __m128i *buffer = &accum3;
-
+  if (pixel_width & 3) {
     accum0 = _mm_setzero_si128();
     accum1 = _mm_setzero_si128();
     accum2 = _mm_setzero_si128();
@@ -403,9 +399,8 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
       coeff16 = _mm_set1_epi16(filter_values[filter_y]);
       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
       src = reinterpret_cast<const __m128i*>(
-          &source_data_rows[filter_y][out_x * 4]);
-      memcpy(buffer, src, r * 4);
-      __m128i src8 = _mm_loadu_si128(buffer);
+          &source_data_rows[filter_y][width<<2]);
+      __m128i src8 = _mm_loadu_si128(src);
       // [16] a1 b1 g1 r1 a0 b0 g0 r0
       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
@@ -451,7 +446,7 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
       accum0 = _mm_or_si128(accum0, mask);
     }
 
-    for (; out_x < end; out_x++) {
+    for (int out_x = width; out_x < pixel_width; out_x++) {
       *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
       accum0 = _mm_srli_si128(accum0, 4);
       out_row += 4;
@@ -462,14 +457,14 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
 
 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
                              int filter_length,
                              unsigned char* const* source_data_rows,
-                             int begin, int end,
+                             int pixel_width,
                              unsigned char* out_row, bool has_alpha) {
   if (has_alpha) {
     ConvolveVertically_SSE2_impl<true>(filter_values, filter_length,
-                                       source_data_rows, begin, end, out_row);
+                                       source_data_rows, pixel_width, out_row);
   } else {
     ConvolveVertically_SSE2_impl<false>(filter_values, filter_length,
-                                        source_data_rows, begin, end, out_row);
+                                        source_data_rows, pixel_width, out_row);
   }
 }
diff --git a/gfx/2d/convolverSSE2.h b/gfx/2d/convolverSSE2.h
index 916c1eddd33f..a54ce676bc7b 100644
--- a/gfx/2d/convolverSSE2.h
+++ b/gfx/2d/convolverSSE2.h
@@ -40,7 +40,6 @@ namespace skia {
 // Convolves horizontally along a single row. The row data is given in
 // |src_data| and continues for the [begin, end) of the filter.
 void ConvolveHorizontally_SSE2(const unsigned char* src_data,
-                               int begin, int end,
                                const ConvolutionFilter1D& filter,
                                unsigned char* out_row);
 
@@ -49,7 +48,6 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,
 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
 // refer to that function for detailed comments.
 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
-                                int begin, int end,
                                 const ConvolutionFilter1D& filter,
                                 unsigned char* out_row[4]);
 
@@ -62,7 +60,7 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
                              int filter_length,
                              unsigned char* const* source_data_rows,
-                             int begin, int end,
+                             int pixel_width,
                              unsigned char* out_row, bool has_alpha);
 
 } // namespace skia
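Note (illustrative, not part of the patch): a small standalone check of the re-indexed tap masks. r = filter_length & 3 is the number of leftover 16-bit taps in the final group of four; with the new table, mask[r] keeps the low r taps and zeroes the rest, so the array is indexed by r directly and mask[0] is never consulted (r == 0 means there is no partial group). The tap values below are made up for the demonstration.

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  __m128i mask[4];
  // Same table as the patched ConvolveHorizontally_SSE2; mask[0] stays unused.
  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

  // Four fake taps loaded as one group; with r == 3 the fourth must be zeroed.
  __m128i coeff = _mm_set_epi16(0, 0, 0, 0, 40, 30, 20, 10);
  __m128i masked = _mm_and_si128(coeff, mask[3]);

  int16_t taps[8];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(taps), masked);
  std::printf("%d %d %d %d\n", taps[0], taps[1], taps[2], taps[3]);  // 10 20 30 0
  return 0;
}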
diff --git a/image/Downscaler.cpp b/image/Downscaler.cpp
index d1fc0512fe99..91a103a1aac0 100644
--- a/image/Downscaler.cpp
+++ b/image/Downscaler.cpp
@@ -91,7 +91,8 @@ Downscaler::BeginFrame(const nsIntSize& aOriginalSize,
                                     mYFilter.get());
 
   // Allocate the buffer, which contains scanlines of the original image.
-  mRowBuffer = MakeUnique<uint8_t[]>(mOriginalSize.width * sizeof(uint32_t));
+  // pad by 15 to handle overreads by the simd code
+  mRowBuffer = MakeUnique<uint8_t[]>(mOriginalSize.width * sizeof(uint32_t) + 15);
   if (MOZ_UNLIKELY(!mRowBuffer)) {
     return NS_ERROR_OUT_OF_MEMORY;
   }
@@ -106,7 +107,8 @@ Downscaler::BeginFrame(const nsIntSize& aOriginalSize,
   }
 
   bool anyAllocationFailed = false;
-  const int rowSize = mTargetSize.width * sizeof(uint32_t);
+  // pad by 15 to handle overreads by the simd code
+  const int rowSize = mTargetSize.width * sizeof(uint32_t) + 15;
   for (int32_t i = 0; i < mWindowCapacity; ++i) {
     mWindow[i] = new uint8_t[rowSize];
     anyAllocationFailed = anyAllocationFailed || mWindow[i] == nullptr;
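Note (illustrative, not part of the patch): why 15 bytes of slack is enough for the Downscaler rows. The SSE2 paths read with unaligned 16-byte _mm_loadu_si128 loads; the furthest such a load can start is the last byte the convolver legitimately touches, so at most 15 bytes past the logical end of a row are ever read. A minimal sketch of a padded row allocation follows; the helper name is hypothetical.

#include <cstddef>
#include <cstdint>
#include <memory>

// Allocate one BGRA row with enough slack that a 16-byte unaligned load
// starting anywhere inside the logical row stays inside the allocation.
static std::unique_ptr<uint8_t[]> AllocateSimdSafeRow(int32_t widthInPixels) {
  const size_t kSimdOverreadPad = 15;  // a 16-byte load minus the 1 valid byte it starts on
  const size_t bytes = size_t(widthInPixels) * sizeof(uint32_t) + kSimdOverreadPad;
  return std::make_unique<uint8_t[]>(bytes);
}

The patch applies the same 15-byte pad to both mRowBuffer and the window rows, since both are handed to the SSE2 convolution routines.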