diff --git a/gfx/2d/convolver.cpp b/gfx/2d/convolver.cpp
index b02552d052b1..b4a23133f36f 100644
--- a/gfx/2d/convolver.cpp
+++ b/gfx/2d/convolver.cpp
@@ -175,11 +175,11 @@ class CircularRowBuffer {
 // |src_data| and continues for the [begin, end) of the filter.
 template<bool has_alpha>
 void ConvolveHorizontally(const unsigned char* src_data,
-                          int begin, int end,
                           const ConvolutionFilter1D& filter,
                           unsigned char* out_row) {
+  int num_values = filter.num_values();
   // Loop over each pixel on this row in the output image.
-  for (int out_x = begin; out_x < end; out_x++) {
+  for (int out_x = 0; out_x < num_values; out_x++) {
     // Get the filter that determines the current output pixel.
     int filter_offset, filter_length;
     const ConvolutionFilter1D::Fixed* filter_values =
@@ -220,17 +220,18 @@ void ConvolveHorizontally(const unsigned char* src_data,
 // Does vertical convolution to produce one output row. The filter values and
 // length are given in the first two parameters. These are applied to each
 // of the rows pointed to in the |source_data_rows| array, with each row
-// being |end - begin| wide.
+// being |pixel_width| wide.
 //
-// The output must have room for |(end - begin) * 4| bytes.
+// The output must have room for |pixel_width * 4| bytes.
 template<bool has_alpha>
 void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
                         int filter_length,
                         unsigned char* const* source_data_rows,
-                        int begin, int end, unsigned char* out_row) {
+                        int pixel_width,
+                        unsigned char* out_row) {
   // We go through each column in the output and do a vertical convolution,
   // generating one output pixel each time.
-  for (int out_x = begin; out_x < end; out_x++) {
+  for (int out_x = 0; out_x < pixel_width; out_x++) {
     // Compute the number of bytes over in each row that the current column
     // we're convolving starts at. The pixel will cover the next 4 bytes.
     int byte_offset = out_x * 4;
@@ -288,28 +289,29 @@ void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
 void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
                         int filter_length,
                         unsigned char* const* source_data_rows,
-                        int width, unsigned char* out_row,
+                        int pixel_width, unsigned char* out_row,
                         bool has_alpha, bool use_simd) {
-  int processed = 0;
 #if defined(USE_SSE2) || defined(_MIPS_ARCH_LOONGSON3A)
   // If the binary was not built with SSE2 support, we had to fallback to C version.
-  int simd_width = width & ~3;
-  if (use_simd && simd_width) {
+  if (use_simd) {
     ConvolveVertically_SIMD(filter_values, filter_length,
-                            source_data_rows, 0, simd_width,
+                            source_data_rows,
+                            pixel_width,
                             out_row, has_alpha);
-    processed = simd_width;
-  }
+  } else
 #endif
-
-  if (width > processed) {
+  {
     if (has_alpha) {
-      ConvolveVertically<true>(filter_values, filter_length, source_data_rows,
-                               processed, width, out_row);
+      ConvolveVertically<true>(filter_values, filter_length,
+                               source_data_rows,
+                               pixel_width,
+                               out_row);
     } else {
-      ConvolveVertically<false>(filter_values, filter_length, source_data_rows,
-                                processed, width, out_row);
+      ConvolveVertically<false>(filter_values, filter_length,
+                                source_data_rows,
+                                pixel_width,
+                                out_row);
     }
   }
 }
@@ -326,16 +328,16 @@ void ConvolveHorizontally(const unsigned char* src_data,
     // SIMD implementation works with 4 pixels at a time.
     // Therefore we process as much as we can using SSE and then use
     // C implementation for leftovers
-    ConvolveHorizontally_SSE2(src_data, 0, simd_width, filter, out_row);
+    ConvolveHorizontally_SSE2(src_data, filter, out_row);
     processed = simd_width;
   }
 #endif
 
   if (width > processed) {
     if (has_alpha) {
-      ConvolveHorizontally<true>(src_data, processed, width, filter, out_row);
+      ConvolveHorizontally<true>(src_data, filter, out_row);
     } else {
-      ConvolveHorizontally<false>(src_data, processed, width, filter, out_row);
+      ConvolveHorizontally<false>(src_data, filter, out_row);
     }
   }
 }
@@ -457,9 +459,23 @@ void BGRAConvolve2D(const unsigned char* source_data,
   int num_output_rows = filter_y.num_values();
 
   int pixel_width = filter_x.num_values();
+
   // We need to check which is the last line to convolve before we advance 4
   // lines in one iteration.
   int last_filter_offset, last_filter_length;
+  // SSE2 can access up to 3 extra pixels past the end of the
+  // buffer. At the bottom of the image, we have to be careful
+  // not to access data past the end of the buffer. Normally
+  // we fall back to the C++ implementation for the last row.
+  // If the last row is less than 3 pixels wide, we may have to fall
+  // back to the C++ version for more rows. Compute how many
+  // rows we need to avoid the SSE implementation for here.
+  filter_x.FilterForValue(filter_x.num_values() - 1, &last_filter_offset,
+                          &last_filter_length);
+#if defined(USE_SSE2) || defined(_MIPS_ARCH_LOONGSON3A)
+  int avoid_simd_rows = 1 + 3 /
+      (last_filter_offset + last_filter_length);
+#endif
   filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
                           &last_filter_length);
@@ -473,36 +489,32 @@ void BGRAConvolve2D(const unsigned char* source_data,
     // We don't want to process too much rows in batches of 4 because
     // we can go out-of-bounds at the end
     while (next_x_row < filter_offset + filter_length) {
-      if (next_x_row + 3 < last_filter_offset + last_filter_length - 3) {
+      if (next_x_row + 3 < last_filter_offset + last_filter_length -
+          avoid_simd_rows) {
         const unsigned char* src[4];
         unsigned char* out_row[4];
         for (int i = 0; i < 4; ++i) {
           src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
           out_row[i] = row_buffer.AdvanceRow();
         }
-        ConvolveHorizontally4_SIMD(src, 0, pixel_width, filter_x, out_row);
+        ConvolveHorizontally4_SIMD(src, filter_x, out_row);
         next_x_row += 4;
       } else {
-        unsigned char* buffer = row_buffer.AdvanceRow();
-
-        // For last rows, SSE2 load possibly to access data beyond the
-        // image area. therefore we use cobined C+SSE version here
-        int simd_width = pixel_width & ~3;
-        if (simd_width) {
+        // Check if we need to avoid SSE2 for this row.
+        if (next_x_row < last_filter_offset + last_filter_length -
+            avoid_simd_rows) {
           ConvolveHorizontally_SIMD(
               &source_data[next_x_row * source_byte_row_stride],
-              0, simd_width, filter_x, buffer);
-        }
-
-        if (pixel_width > simd_width) {
+              filter_x, row_buffer.AdvanceRow());
+        } else {
           if (source_has_alpha) {
             ConvolveHorizontally<true>(
                 &source_data[next_x_row * source_byte_row_stride],
-                simd_width, pixel_width, filter_x, buffer);
+                filter_x, row_buffer.AdvanceRow());
           } else {
             ConvolveHorizontally<false>(
                 &source_data[next_x_row * source_byte_row_stride],
-                simd_width, pixel_width, filter_x, buffer);
+                filter_x, row_buffer.AdvanceRow());
           }
         }
         next_x_row++;
@@ -513,12 +525,12 @@
     while (next_x_row < filter_offset + filter_length) {
       if (source_has_alpha) {
         ConvolveHorizontally<true>(
-            &source_data[next_x_row * source_byte_row_stride],
-            0, pixel_width, filter_x, row_buffer.AdvanceRow());
+            &source_data[next_x_row * source_byte_row_stride],
+            filter_x, row_buffer.AdvanceRow());
       } else {
         ConvolveHorizontally<false>(
-            &source_data[next_x_row * source_byte_row_stride],
-            0, pixel_width, filter_x, row_buffer.AdvanceRow());
+            &source_data[next_x_row * source_byte_row_stride],
+            filter_x, row_buffer.AdvanceRow());
       }
       next_x_row++;
     }
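Note (illustrative, not part of the patch): a minimal sketch of the arithmetic behind the new avoid_simd_rows bound, under the reading that the SSE2 horizontal pass may read up to 3 pixels (12 bytes) past the last filter tap of a source row. Because source rows sit back to back in one buffer, that over-read normally just spills into the following row; only the bottom rows of the image have to take the C++ path. Here "covered" stands for last_filter_offset + last_filter_length, which the patch comment treats as the usable width of a row.

#include <cstdio>

// Mirrors the avoid_simd_rows expression added to BGRAConvolve2D (sketch only).
static int AvoidSimdRows(int covered) {
  // One row for the over-read out of the last row itself, plus however many
  // whole extra rows a 3-pixel spill can cross when rows are very narrow.
  return 1 + 3 / covered;
}

int main() {
  // covered = 1: the spill crosses 3 more rows, so 4 rows fall back to C++.
  // covered = 3: one extra row, so 2 rows fall back.
  // covered >= 4: only the last row needs the C++ path.
  std::printf("%d %d %d\n", AvoidSimdRows(1), AvoidSimdRows(3), AvoidSimdRows(8));  // 4 2 1
  return 0;
}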
diff --git a/gfx/2d/convolverSSE2.cpp b/gfx/2d/convolverSSE2.cpp
index a1b223703eee..20ddd45f1fbe 100644
--- a/gfx/2d/convolverSSE2.cpp
+++ b/gfx/2d/convolverSSE2.cpp
@@ -35,26 +35,24 @@ namespace skia {
 
 // Convolves horizontally along a single row. The row data is given in
-// |src_data| and continues for the [begin, end) of the filter.
+// |src_data| and continues for the num_values() of the filter.
 void ConvolveHorizontally_SSE2(const unsigned char* src_data,
-                               int begin, int end,
                                const ConvolutionFilter1D& filter,
                                unsigned char* out_row) {
+  int num_values = filter.num_values();
   int filter_offset, filter_length;
   __m128i zero = _mm_setzero_si128();
-  __m128i mask[3];
+  __m128i mask[4];
   // |mask| will be used to decimate all extra filter coefficients that are
   // loaded by SIMD when |filter_length| is not divisible by 4.
-  mask[0] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
-  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
-  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
-
-  // This buffer is used for tails
-  __m128i buffer;
+  // mask[0] is not used in following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
 
   // Output one pixel each iteration, calculating all channels (RGBA) together.
-  for (int out_x = begin; out_x < end; out_x++) {
+  for (int out_x = 0; out_x < num_values; out_x++) {
     const ConvolutionFilter1D::Fixed* filter_values =
         filter.FilterForValue(out_x, &filter_offset, &filter_length);
@@ -117,22 +115,21 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,
 
     // When |filter_length| is not divisible by 4, we need to decimate some of
     // the filter coefficient that was loaded incorrectly to zero; Other than
-    // that the algorithm is same with above, except that the 4th pixel will be
+    // that the algorithm is same with above, exceot that the 4th pixel will be
     // always absent.
-    int r = filter_length & 3;
+    int r = filter_length&3;
     if (r) {
-      memcpy(&buffer, row_to_filter, r * 4);
       // Note: filter_values must be padded to align_up(filter_offset, 8).
       __m128i coeff, coeff16;
       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
       // Mask out extra filter taps.
-      coeff = _mm_and_si128(coeff, mask[r-1]);
+      coeff = _mm_and_si128(coeff, mask[r]);
       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
 
       // Note: line buffer must be padded to align_up(filter_offset, 16).
-      // We resolve this by temporary buffer
-      __m128i src8 = _mm_loadu_si128(&buffer);
+      // We resolve this by use C-version for the last horizontal line.
+      __m128i src8 = _mm_loadu_si128(row_to_filter);
       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
@@ -165,24 +162,26 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,
 }
 
 // Convolves horizontally along four rows. The row data is given in
-// |src_data| and continues for the [begin, end) of the filter.
+// |src_data| and continues for the num_values() of the filter.
 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
 // refer to that function for detailed comments.
 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
-                                int begin, int end,
                                 const ConvolutionFilter1D& filter,
                                 unsigned char* out_row[4]) {
+  int num_values = filter.num_values();
+
   int filter_offset, filter_length;
   __m128i zero = _mm_setzero_si128();
-  __m128i mask[3];
+  __m128i mask[4];
   // |mask| will be used to decimate all extra filter coefficients that are
   // loaded by SIMD when |filter_length| is not divisible by 4.
-  mask[0] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
-  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
-  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+  // mask[0] is not used in following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
 
   // Output one pixel each iteration, calculating all channels (RGBA) together.
-  for (int out_x = begin; out_x < end; out_x++) {
+  for (int out_x = 0; out_x < num_values; out_x++) {
     const ConvolutionFilter1D::Fixed* filter_values =
         filter.FilterForValue(out_x, &filter_offset, &filter_length);
 
@@ -240,7 +239,7 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
       __m128i coeff;
       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
       // Mask out extra filter taps.
-      coeff = _mm_and_si128(coeff, mask[r-1]);
+      coeff = _mm_and_si128(coeff, mask[r]);
       __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
       /* c1 c1 c1 c1 c0 c0 c0 c0 */
@@ -284,21 +283,22 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
 
 // Does vertical convolution to produce one output row. The filter values and
 // length are given in the first two parameters. These are applied to each
 // of the rows pointed to in the |source_data_rows| array, with each row
-// being |end - begin| wide.
+// being |pixel_width| wide.
 //
-// The output must have room for |(end - begin) * 4| bytes.
+// The output must have room for |pixel_width * 4| bytes.
 template<bool has_alpha>
 void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_values,
                                   int filter_length,
                                   unsigned char* const* source_data_rows,
-                                  int begin, int end,
+                                  int pixel_width,
                                   unsigned char* out_row) {
+  int width = pixel_width & ~3;
+
   __m128i zero = _mm_setzero_si128();
   __m128i accum0, accum1, accum2, accum3, coeff16;
   const __m128i* src;
-  int out_x;
   // Output four pixels per iteration (16 bytes).
-  for (out_x = begin; out_x + 3 < end; out_x += 4) {
+  for (int out_x = 0; out_x < width; out_x += 4) {
     // Accumulated result for each pixel. 32 bits per RGBA channel.
     accum0 = _mm_setzero_si128();
@@ -391,11 +391,7 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
 
   // When the width of the output is not divisible by 4, We need to save one
   // pixel (4 bytes) each time. And also the fourth pixel is always absent.
-  int r = end - out_x;
-  if (r > 0) {
-    // Since accum3 is never used here, we'll use it as a buffer
-    __m128i *buffer = &accum3;
-
+  if (pixel_width & 3) {
     accum0 = _mm_setzero_si128();
     accum1 = _mm_setzero_si128();
     accum2 = _mm_setzero_si128();
@@ -403,9 +399,8 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
       coeff16 = _mm_set1_epi16(filter_values[filter_y]);
       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
       src = reinterpret_cast<const __m128i*>(
-          &source_data_rows[filter_y][out_x * 4]);
-      memcpy(buffer, src, r * 4);
-      __m128i src8 = _mm_loadu_si128(buffer);
+          &source_data_rows[filter_y][width<<2]);
+      __m128i src8 = _mm_loadu_si128(src);
       // [16] a1 b1 g1 r1 a0 b0 g0 r0
       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
@@ -451,7 +446,7 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
       accum0 = _mm_or_si128(accum0, mask);
     }
 
-    for (; out_x < end; out_x++) {
+    for (int out_x = width; out_x < pixel_width; out_x++) {
       *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
       accum0 = _mm_srli_si128(accum0, 4);
       out_row += 4;
@@ -462,14 +457,14 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
 
 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
                              int filter_length,
                              unsigned char* const* source_data_rows,
-                             int begin, int end,
+                             int pixel_width,
                              unsigned char* out_row, bool has_alpha) {
   if (has_alpha) {
     ConvolveVertically_SSE2_impl<true>(filter_values, filter_length,
-                                       source_data_rows, begin, end, out_row);
+                                       source_data_rows, pixel_width, out_row);
   } else {
     ConvolveVertically_SSE2_impl<false>(filter_values, filter_length,
-                                        source_data_rows, begin, end, out_row);
+                                        source_data_rows, pixel_width, out_row);
   }
 }
diff --git a/gfx/2d/convolverSSE2.h b/gfx/2d/convolverSSE2.h
index 916c1eddd33f..a54ce676bc7b 100644
--- a/gfx/2d/convolverSSE2.h
+++ b/gfx/2d/convolverSSE2.h
@@ -40,7 +40,6 @@ namespace skia {
 // Convolves horizontally along a single row. The row data is given in
 // |src_data| and continues for the [begin, end) of the filter.
 void ConvolveHorizontally_SSE2(const unsigned char* src_data,
-                               int begin, int end,
                                const ConvolutionFilter1D& filter,
                                unsigned char* out_row);
 
@@ -49,7 +48,6 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,
 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
 // refer to that function for detailed comments.
 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
-                                int begin, int end,
                                 const ConvolutionFilter1D& filter,
                                 unsigned char* out_row[4]);
 
@@ -62,7 +60,7 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
                              int filter_length,
                              unsigned char* const* source_data_rows,
-                             int begin, int end,
+                             int pixel_width,
                              unsigned char* out_row, bool has_alpha);
 
 } // namespace skia
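Note (illustrative, not part of the patch): a small standalone check of the re-indexed tap masks. r = filter_length & 3 is the number of leftover 16-bit taps in the final group of four; with the new table, mask[r] keeps the low r taps and zeroes the rest, so the array is indexed by r directly and mask[0] is never consulted (r == 0 means there is no partial group). The tap values below are made up for the demonstration.

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  __m128i mask[4];
  // Same table as the patched ConvolveHorizontally_SSE2; mask[0] stays unused.
  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

  // Four fake taps loaded as one group; with r == 3 the fourth must be zeroed.
  __m128i coeff = _mm_set_epi16(0, 0, 0, 0, 40, 30, 20, 10);
  __m128i masked = _mm_and_si128(coeff, mask[3]);

  int16_t taps[8];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(taps), masked);
  std::printf("%d %d %d %d\n", taps[0], taps[1], taps[2], taps[3]);  // 10 20 30 0
  return 0;
}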
diff --git a/image/Downscaler.cpp b/image/Downscaler.cpp
index d1fc0512fe99..91a103a1aac0 100644
--- a/image/Downscaler.cpp
+++ b/image/Downscaler.cpp
@@ -91,7 +91,8 @@ Downscaler::BeginFrame(const nsIntSize& aOriginalSize,
                                     mYFilter.get());
 
   // Allocate the buffer, which contains scanlines of the original image.
-  mRowBuffer = MakeUnique<uint8_t[]>(mOriginalSize.width * sizeof(uint32_t));
+  // pad by 15 to handle overreads by the simd code
+  mRowBuffer = MakeUnique<uint8_t[]>(mOriginalSize.width * sizeof(uint32_t) + 15);
   if (MOZ_UNLIKELY(!mRowBuffer)) {
     return NS_ERROR_OUT_OF_MEMORY;
   }
@@ -106,7 +107,8 @@ Downscaler::BeginFrame(const nsIntSize& aOriginalSize,
   }
 
   bool anyAllocationFailed = false;
-  const int rowSize = mTargetSize.width * sizeof(uint32_t);
+  // pad by 15 to handle overreads by the simd code
+  const int rowSize = mTargetSize.width * sizeof(uint32_t) + 15;
   for (int32_t i = 0; i < mWindowCapacity; ++i) {
     mWindow[i] = new uint8_t[rowSize];
     anyAllocationFailed = anyAllocationFailed || mWindow[i] == nullptr;
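Note (illustrative, not part of the patch): why 15 bytes of slack is enough for the Downscaler rows. The SSE2 paths read with unaligned 16-byte _mm_loadu_si128 loads; the furthest such a load can start is the last byte the convolver legitimately touches, so at most 15 bytes past the logical end of a row are ever read. A minimal sketch of a padded row allocation follows; the helper name is hypothetical.

#include <cstddef>
#include <cstdint>
#include <memory>

// Allocate one BGRA row with enough slack that a 16-byte unaligned load
// starting anywhere inside the logical row stays inside the allocation.
static std::unique_ptr<uint8_t[]> AllocateSimdSafeRow(int32_t widthInPixels) {
  const size_t kSimdOverreadPad = 15;  // a 16-byte load minus the 1 valid byte it starts on
  const size_t bytes = size_t(widthInPixels) * sizeof(uint32_t) + kSimdOverreadPad;
  return std::make_unique<uint8_t[]>(bytes);
}

The patch applies the same 15-byte pad to both mRowBuffer and the window rows, since both are handed to the SSE2 convolution routines.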