mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-28 15:23:51 +00:00
Bug 1180225. Make convolver more like upstream. r=seth
This fixes uninitialised reads. --HG-- extra : rebase_source : c7f50d9ea7a56d845ccb2f324e3e73fbd2e83003
This commit is contained in:
parent
f9055a2c4d
commit
eb58e4fc03
@ -175,11 +175,11 @@ class CircularRowBuffer {
|
||||
// |src_data| and continues for the [begin, end) of the filter.
|
||||
template<bool has_alpha>
|
||||
void ConvolveHorizontally(const unsigned char* src_data,
|
||||
int begin, int end,
|
||||
const ConvolutionFilter1D& filter,
|
||||
unsigned char* out_row) {
|
||||
int num_values = filter.num_values();
|
||||
// Loop over each pixel on this row in the output image.
|
||||
for (int out_x = begin; out_x < end; out_x++) {
|
||||
for (int out_x = 0; out_x < num_values; out_x++) {
|
||||
// Get the filter that determines the current output pixel.
|
||||
int filter_offset, filter_length;
|
||||
const ConvolutionFilter1D::Fixed* filter_values =
|
||||
@ -220,17 +220,18 @@ void ConvolveHorizontally(const unsigned char* src_data,
|
||||
// Does vertical convolution to produce one output row. The filter values and
|
||||
// length are given in the first two parameters. These are applied to each
|
||||
// of the rows pointed to in the |source_data_rows| array, with each row
|
||||
// being |end - begin| wide.
|
||||
// being |pixel_width| wide.
|
||||
//
|
||||
// The output must have room for |(end - begin) * 4| bytes.
|
||||
// The output must have room for |pixel_width * 4| bytes.
|
||||
template<bool has_alpha>
|
||||
void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
|
||||
int filter_length,
|
||||
unsigned char* const* source_data_rows,
|
||||
int begin, int end, unsigned char* out_row) {
|
||||
int pixel_width,
|
||||
unsigned char* out_row) {
|
||||
// We go through each column in the output and do a vertical convolution,
|
||||
// generating one output pixel each time.
|
||||
for (int out_x = begin; out_x < end; out_x++) {
|
||||
for (int out_x = 0; out_x < pixel_width; out_x++) {
|
||||
// Compute the number of bytes over in each row that the current column
|
||||
// we're convolving starts at. The pixel will cover the next 4 bytes.
|
||||
int byte_offset = out_x * 4;
|
||||
@ -288,28 +289,29 @@ void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
|
||||
void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
|
||||
int filter_length,
|
||||
unsigned char* const* source_data_rows,
|
||||
int width, unsigned char* out_row,
|
||||
int pixel_width, unsigned char* out_row,
|
||||
bool has_alpha, bool use_simd) {
|
||||
int processed = 0;
|
||||
|
||||
#if defined(USE_SSE2) || defined(_MIPS_ARCH_LOONGSON3A)
|
||||
// If the binary was not built with SSE2 support, we had to fallback to C version.
|
||||
int simd_width = width & ~3;
|
||||
if (use_simd && simd_width) {
|
||||
if (use_simd) {
|
||||
ConvolveVertically_SIMD(filter_values, filter_length,
|
||||
source_data_rows, 0, simd_width,
|
||||
source_data_rows,
|
||||
pixel_width,
|
||||
out_row, has_alpha);
|
||||
processed = simd_width;
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
|
||||
if (width > processed) {
|
||||
{
|
||||
if (has_alpha) {
|
||||
ConvolveVertically<true>(filter_values, filter_length, source_data_rows,
|
||||
processed, width, out_row);
|
||||
ConvolveVertically<true>(filter_values, filter_length,
|
||||
source_data_rows,
|
||||
pixel_width,
|
||||
out_row);
|
||||
} else {
|
||||
ConvolveVertically<false>(filter_values, filter_length, source_data_rows,
|
||||
processed, width, out_row);
|
||||
ConvolveVertically<false>(filter_values, filter_length,
|
||||
source_data_rows,
|
||||
pixel_width,
|
||||
out_row);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -326,16 +328,16 @@ void ConvolveHorizontally(const unsigned char* src_data,
|
||||
// SIMD implementation works with 4 pixels at a time.
|
||||
// Therefore we process as much as we can using SSE and then use
|
||||
// C implementation for leftovers
|
||||
ConvolveHorizontally_SSE2(src_data, 0, simd_width, filter, out_row);
|
||||
ConvolveHorizontally_SSE2(src_data, filter, out_row);
|
||||
processed = simd_width;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (width > processed) {
|
||||
if (has_alpha) {
|
||||
ConvolveHorizontally<true>(src_data, processed, width, filter, out_row);
|
||||
ConvolveHorizontally<true>(src_data, filter, out_row);
|
||||
} else {
|
||||
ConvolveHorizontally<false>(src_data, processed, width, filter, out_row);
|
||||
ConvolveHorizontally<false>(src_data, filter, out_row);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -457,9 +459,23 @@ void BGRAConvolve2D(const unsigned char* source_data,
|
||||
int num_output_rows = filter_y.num_values();
|
||||
int pixel_width = filter_x.num_values();
|
||||
|
||||
|
||||
// We need to check which is the last line to convolve before we advance 4
|
||||
// lines in one iteration.
|
||||
int last_filter_offset, last_filter_length;
|
||||
// SSE2 can access up to 3 extra pixels past the end of the
|
||||
// buffer. At the bottom of the image, we have to be careful
|
||||
// not to access data past the end of the buffer. Normally
|
||||
// we fall back to the C++ implementation for the last row.
|
||||
// If the last row is less than 3 pixels wide, we may have to fall
|
||||
// back to the C++ version for more rows. Compute how many
|
||||
// rows we need to avoid the SSE implementation for here.
|
||||
filter_x.FilterForValue(filter_x.num_values() - 1, &last_filter_offset,
|
||||
&last_filter_length);
|
||||
#if defined(USE_SSE2) || defined(_MIPS_ARCH_LOONGSON3A)
|
||||
int avoid_simd_rows = 1 + 3 /
|
||||
(last_filter_offset + last_filter_length);
|
||||
#endif
|
||||
filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
|
||||
&last_filter_length);
|
||||
|
||||
@ -473,36 +489,32 @@ void BGRAConvolve2D(const unsigned char* source_data,
|
||||
// We don't want to process too much rows in batches of 4 because
|
||||
// we can go out-of-bounds at the end
|
||||
while (next_x_row < filter_offset + filter_length) {
|
||||
if (next_x_row + 3 < last_filter_offset + last_filter_length - 3) {
|
||||
if (next_x_row + 3 < last_filter_offset + last_filter_length -
|
||||
avoid_simd_rows) {
|
||||
const unsigned char* src[4];
|
||||
unsigned char* out_row[4];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
|
||||
out_row[i] = row_buffer.AdvanceRow();
|
||||
}
|
||||
ConvolveHorizontally4_SIMD(src, 0, pixel_width, filter_x, out_row);
|
||||
ConvolveHorizontally4_SIMD(src, filter_x, out_row);
|
||||
next_x_row += 4;
|
||||
} else {
|
||||
unsigned char* buffer = row_buffer.AdvanceRow();
|
||||
|
||||
// For last rows, SSE2 load possibly to access data beyond the
|
||||
// image area. therefore we use cobined C+SSE version here
|
||||
int simd_width = pixel_width & ~3;
|
||||
if (simd_width) {
|
||||
// Check if we need to avoid SSE2 for this row.
|
||||
if (next_x_row < last_filter_offset + last_filter_length -
|
||||
avoid_simd_rows) {
|
||||
ConvolveHorizontally_SIMD(
|
||||
&source_data[next_x_row * source_byte_row_stride],
|
||||
0, simd_width, filter_x, buffer);
|
||||
}
|
||||
|
||||
if (pixel_width > simd_width) {
|
||||
filter_x, row_buffer.AdvanceRow());
|
||||
} else {
|
||||
if (source_has_alpha) {
|
||||
ConvolveHorizontally<true>(
|
||||
&source_data[next_x_row * source_byte_row_stride],
|
||||
simd_width, pixel_width, filter_x, buffer);
|
||||
filter_x, row_buffer.AdvanceRow());
|
||||
} else {
|
||||
ConvolveHorizontally<false>(
|
||||
&source_data[next_x_row * source_byte_row_stride],
|
||||
simd_width, pixel_width, filter_x, buffer);
|
||||
filter_x, row_buffer.AdvanceRow());
|
||||
}
|
||||
}
|
||||
next_x_row++;
|
||||
@ -513,12 +525,12 @@ void BGRAConvolve2D(const unsigned char* source_data,
|
||||
while (next_x_row < filter_offset + filter_length) {
|
||||
if (source_has_alpha) {
|
||||
ConvolveHorizontally<true>(
|
||||
&source_data[next_x_row * source_byte_row_stride],
|
||||
0, pixel_width, filter_x, row_buffer.AdvanceRow());
|
||||
&source_data[next_x_row * source_byte_row_stride],
|
||||
filter_x, row_buffer.AdvanceRow());
|
||||
} else {
|
||||
ConvolveHorizontally<false>(
|
||||
&source_data[next_x_row * source_byte_row_stride],
|
||||
0, pixel_width, filter_x, row_buffer.AdvanceRow());
|
||||
&source_data[next_x_row * source_byte_row_stride],
|
||||
filter_x, row_buffer.AdvanceRow());
|
||||
}
|
||||
next_x_row++;
|
||||
}
|
||||
|
@ -35,26 +35,24 @@
|
||||
namespace skia {
|
||||
|
||||
// Convolves horizontally along a single row. The row data is given in
|
||||
// |src_data| and continues for the [begin, end) of the filter.
|
||||
// |src_data| and continues for the num_values() of the filter.
|
||||
void ConvolveHorizontally_SSE2(const unsigned char* src_data,
|
||||
int begin, int end,
|
||||
const ConvolutionFilter1D& filter,
|
||||
unsigned char* out_row) {
|
||||
int num_values = filter.num_values();
|
||||
|
||||
int filter_offset, filter_length;
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
__m128i mask[3];
|
||||
__m128i mask[4];
|
||||
// |mask| will be used to decimate all extra filter coefficients that are
|
||||
// loaded by SIMD when |filter_length| is not divisible by 4.
|
||||
mask[0] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
|
||||
mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
|
||||
mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
|
||||
|
||||
// This buffer is used for tails
|
||||
__m128i buffer;
|
||||
// mask[0] is not used in following algorithm.
|
||||
mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
|
||||
mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
|
||||
mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
|
||||
|
||||
// Output one pixel each iteration, calculating all channels (RGBA) together.
|
||||
for (int out_x = begin; out_x < end; out_x++) {
|
||||
for (int out_x = 0; out_x < num_values; out_x++) {
|
||||
const ConvolutionFilter1D::Fixed* filter_values =
|
||||
filter.FilterForValue(out_x, &filter_offset, &filter_length);
|
||||
|
||||
@ -117,22 +115,21 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,
|
||||
|
||||
// When |filter_length| is not divisible by 4, we need to decimate some of
|
||||
// the filter coefficient that was loaded incorrectly to zero; Other than
|
||||
// that the algorithm is same with above, except that the 4th pixel will be
|
||||
// that the algorithm is same with above, exceot that the 4th pixel will be
|
||||
// always absent.
|
||||
int r = filter_length & 3;
|
||||
int r = filter_length&3;
|
||||
if (r) {
|
||||
memcpy(&buffer, row_to_filter, r * 4);
|
||||
// Note: filter_values must be padded to align_up(filter_offset, 8).
|
||||
__m128i coeff, coeff16;
|
||||
coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
|
||||
// Mask out extra filter taps.
|
||||
coeff = _mm_and_si128(coeff, mask[r-1]);
|
||||
coeff = _mm_and_si128(coeff, mask[r]);
|
||||
coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
|
||||
coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
|
||||
|
||||
// Note: line buffer must be padded to align_up(filter_offset, 16).
|
||||
// We resolve this by temporary buffer
|
||||
__m128i src8 = _mm_loadu_si128(&buffer);
|
||||
// We resolve this by use C-version for the last horizontal line.
|
||||
__m128i src8 = _mm_loadu_si128(row_to_filter);
|
||||
__m128i src16 = _mm_unpacklo_epi8(src8, zero);
|
||||
__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
|
||||
__m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
|
||||
@ -165,24 +162,26 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,
|
||||
}
|
||||
|
||||
// Convolves horizontally along four rows. The row data is given in
|
||||
// |src_data| and continues for the [begin, end) of the filter.
|
||||
// |src_data| and continues for the num_values() of the filter.
|
||||
// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
|
||||
// refer to that function for detailed comments.
|
||||
void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
|
||||
int begin, int end,
|
||||
const ConvolutionFilter1D& filter,
|
||||
unsigned char* out_row[4]) {
|
||||
int num_values = filter.num_values();
|
||||
|
||||
int filter_offset, filter_length;
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
__m128i mask[3];
|
||||
__m128i mask[4];
|
||||
// |mask| will be used to decimate all extra filter coefficients that are
|
||||
// loaded by SIMD when |filter_length| is not divisible by 4.
|
||||
mask[0] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
|
||||
mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
|
||||
mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
|
||||
// mask[0] is not used in following algorithm.
|
||||
mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
|
||||
mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
|
||||
mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
|
||||
|
||||
// Output one pixel each iteration, calculating all channels (RGBA) together.
|
||||
for (int out_x = begin; out_x < end; out_x++) {
|
||||
for (int out_x = 0; out_x < num_values; out_x++) {
|
||||
const ConvolutionFilter1D::Fixed* filter_values =
|
||||
filter.FilterForValue(out_x, &filter_offset, &filter_length);
|
||||
|
||||
@ -240,7 +239,7 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
|
||||
__m128i coeff;
|
||||
coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
|
||||
// Mask out extra filter taps.
|
||||
coeff = _mm_and_si128(coeff, mask[r-1]);
|
||||
coeff = _mm_and_si128(coeff, mask[r]);
|
||||
|
||||
__m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
|
||||
/* c1 c1 c1 c1 c0 c0 c0 c0 */
|
||||
@ -284,21 +283,22 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
|
||||
// Does vertical convolution to produce one output row. The filter values and
|
||||
// length are given in the first two parameters. These are applied to each
|
||||
// of the rows pointed to in the |source_data_rows| array, with each row
|
||||
// being |end - begin| wide.
|
||||
// being |pixel_width| wide.
|
||||
//
|
||||
// The output must have room for |(end - begin) * 4| bytes.
|
||||
// The output must have room for |pixel_width * 4| bytes.
|
||||
template<bool has_alpha>
|
||||
void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_values,
|
||||
int filter_length,
|
||||
unsigned char* const* source_data_rows,
|
||||
int begin, int end,
|
||||
int pixel_width,
|
||||
unsigned char* out_row) {
|
||||
int width = pixel_width & ~3;
|
||||
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
__m128i accum0, accum1, accum2, accum3, coeff16;
|
||||
const __m128i* src;
|
||||
int out_x;
|
||||
// Output four pixels per iteration (16 bytes).
|
||||
for (out_x = begin; out_x + 3 < end; out_x += 4) {
|
||||
for (int out_x = 0; out_x < width; out_x += 4) {
|
||||
|
||||
// Accumulated result for each pixel. 32 bits per RGBA channel.
|
||||
accum0 = _mm_setzero_si128();
|
||||
@ -391,11 +391,7 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
|
||||
|
||||
// When the width of the output is not divisible by 4, We need to save one
|
||||
// pixel (4 bytes) each time. And also the fourth pixel is always absent.
|
||||
int r = end - out_x;
|
||||
if (r > 0) {
|
||||
// Since accum3 is never used here, we'll use it as a buffer
|
||||
__m128i *buffer = &accum3;
|
||||
|
||||
if (pixel_width & 3) {
|
||||
accum0 = _mm_setzero_si128();
|
||||
accum1 = _mm_setzero_si128();
|
||||
accum2 = _mm_setzero_si128();
|
||||
@ -403,9 +399,8 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
|
||||
coeff16 = _mm_set1_epi16(filter_values[filter_y]);
|
||||
// [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
|
||||
src = reinterpret_cast<const __m128i*>(
|
||||
&source_data_rows[filter_y][out_x * 4]);
|
||||
memcpy(buffer, src, r * 4);
|
||||
__m128i src8 = _mm_loadu_si128(buffer);
|
||||
&source_data_rows[filter_y][width<<2]);
|
||||
__m128i src8 = _mm_loadu_si128(src);
|
||||
// [16] a1 b1 g1 r1 a0 b0 g0 r0
|
||||
__m128i src16 = _mm_unpacklo_epi8(src8, zero);
|
||||
__m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
|
||||
@ -451,7 +446,7 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
|
||||
accum0 = _mm_or_si128(accum0, mask);
|
||||
}
|
||||
|
||||
for (; out_x < end; out_x++) {
|
||||
for (int out_x = width; out_x < pixel_width; out_x++) {
|
||||
*(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
|
||||
accum0 = _mm_srli_si128(accum0, 4);
|
||||
out_row += 4;
|
||||
@ -462,14 +457,14 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
|
||||
void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
|
||||
int filter_length,
|
||||
unsigned char* const* source_data_rows,
|
||||
int begin, int end,
|
||||
int pixel_width,
|
||||
unsigned char* out_row, bool has_alpha) {
|
||||
if (has_alpha) {
|
||||
ConvolveVertically_SSE2_impl<true>(filter_values, filter_length,
|
||||
source_data_rows, begin, end, out_row);
|
||||
source_data_rows, pixel_width, out_row);
|
||||
} else {
|
||||
ConvolveVertically_SSE2_impl<false>(filter_values, filter_length,
|
||||
source_data_rows, begin, end, out_row);
|
||||
source_data_rows, pixel_width, out_row);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -40,7 +40,6 @@ namespace skia {
|
||||
// Convolves horizontally along a single row. The row data is given in
|
||||
// |src_data| and continues for the [begin, end) of the filter.
|
||||
void ConvolveHorizontally_SSE2(const unsigned char* src_data,
|
||||
int begin, int end,
|
||||
const ConvolutionFilter1D& filter,
|
||||
unsigned char* out_row);
|
||||
|
||||
@ -49,7 +48,6 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,
|
||||
// The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
|
||||
// refer to that function for detailed comments.
|
||||
void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
|
||||
int begin, int end,
|
||||
const ConvolutionFilter1D& filter,
|
||||
unsigned char* out_row[4]);
|
||||
|
||||
@ -62,7 +60,7 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
|
||||
void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
|
||||
int filter_length,
|
||||
unsigned char* const* source_data_rows,
|
||||
int begin, int end,
|
||||
int pixel_width,
|
||||
unsigned char* out_row, bool has_alpha);
|
||||
|
||||
} // namespace skia
|
||||
|
@ -91,7 +91,8 @@ Downscaler::BeginFrame(const nsIntSize& aOriginalSize,
|
||||
mYFilter.get());
|
||||
|
||||
// Allocate the buffer, which contains scanlines of the original image.
|
||||
mRowBuffer = MakeUnique<uint8_t[]>(mOriginalSize.width * sizeof(uint32_t));
|
||||
// pad by 15 to handle overreads by the simd code
|
||||
mRowBuffer = MakeUnique<uint8_t[]>(mOriginalSize.width * sizeof(uint32_t) + 15);
|
||||
if (MOZ_UNLIKELY(!mRowBuffer)) {
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
}
|
||||
@ -106,7 +107,8 @@ Downscaler::BeginFrame(const nsIntSize& aOriginalSize,
|
||||
}
|
||||
|
||||
bool anyAllocationFailed = false;
|
||||
const int rowSize = mTargetSize.width * sizeof(uint32_t);
|
||||
// pad by 15 to handle overreads by the simd code
|
||||
const int rowSize = mTargetSize.width * sizeof(uint32_t) + 15;
|
||||
for (int32_t i = 0; i < mWindowCapacity; ++i) {
|
||||
mWindow[i] = new uint8_t[rowSize];
|
||||
anyAllocationFailed = anyAllocationFailed || mWindow[i] == nullptr;
|
||||
|
Loading…
Reference in New Issue
Block a user