mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-08 10:44:56 +00:00
Bug 509052: Add new, faster blurring code. r=derf
This commit is contained in:
parent
923d906d77
commit
cbb586e21a
258
gfx/2d/Blur.cpp
258
gfx/2d/Blur.cpp
@ -12,6 +12,9 @@
|
||||
#include "mozilla/Constants.h"
|
||||
#include "mozilla/Util.h"
|
||||
|
||||
#include "2D.h"
|
||||
#include "Tools.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace mozilla {
|
||||
@ -311,8 +314,8 @@ SpreadVertical(unsigned char* aInput,
|
||||
}
|
||||
}
|
||||
|
||||
static CheckedInt<int32_t>
|
||||
RoundUpToMultipleOf4(int32_t aVal)
|
||||
CheckedInt<int32_t>
|
||||
AlphaBoxBlur::RoundUpToMultipleOf4(int32_t aVal)
|
||||
{
|
||||
CheckedInt<int32_t> val(aVal);
|
||||
|
||||
@ -378,10 +381,9 @@ AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect,
|
||||
if (stride.isValid()) {
|
||||
mStride = stride.value();
|
||||
|
||||
CheckedInt<int32_t> size = CheckedInt<int32_t>(mStride) * mRect.height *
|
||||
sizeof(unsigned char);
|
||||
CheckedInt<int32_t> size = CheckedInt<int32_t>(mStride) * mRect.height;
|
||||
if (size.isValid()) {
|
||||
mData = static_cast<unsigned char*>(malloc(size.value()));
|
||||
mData = new uint8_t[size.value()];
|
||||
memset(mData, 0, size.value());
|
||||
}
|
||||
}
|
||||
@ -405,7 +407,7 @@ AlphaBoxBlur::AlphaBoxBlur(uint8_t* aData,
|
||||
AlphaBoxBlur::~AlphaBoxBlur()
|
||||
{
|
||||
if (mFreeData) {
|
||||
free(mData);
|
||||
delete [] mData;
|
||||
}
|
||||
}
|
||||
|
||||
@ -455,42 +457,236 @@ AlphaBoxBlur::Blur()
|
||||
if (mBlurRadius != IntSize(0,0) || mSpreadRadius != IntSize(0,0)) {
|
||||
int32_t stride = GetStride();
|
||||
|
||||
// No need to use CheckedInt here - we have validated it in the constructor.
|
||||
size_t szB = stride * GetSize().height * sizeof(unsigned char);
|
||||
unsigned char* tmpData = static_cast<unsigned char*>(malloc(szB));
|
||||
if (!tmpData)
|
||||
return; // OOM
|
||||
|
||||
memset(tmpData, 0, szB);
|
||||
IntSize size = GetSize();
|
||||
|
||||
if (mSpreadRadius.width > 0 || mSpreadRadius.height > 0) {
|
||||
// No need to use CheckedInt here - we have validated it in the constructor.
|
||||
size_t szB = stride * size.height;
|
||||
unsigned char* tmpData = new uint8_t[szB];
|
||||
|
||||
memset(tmpData, 0, szB);
|
||||
|
||||
SpreadHorizontal(mData, tmpData, mSpreadRadius.width, GetSize().width, GetSize().height, stride, mSkipRect);
|
||||
SpreadVertical(tmpData, mData, mSpreadRadius.height, GetSize().width, GetSize().height, stride, mSkipRect);
|
||||
|
||||
delete [] tmpData;
|
||||
}
|
||||
|
||||
if (mBlurRadius.width > 0) {
|
||||
int32_t lobes[3][2];
|
||||
ComputeLobes(mBlurRadius.width, lobes);
|
||||
BoxBlurHorizontal(mData, tmpData, lobes[0][0], lobes[0][1], stride, GetSize().height, mSkipRect);
|
||||
BoxBlurHorizontal(tmpData, mData, lobes[1][0], lobes[1][1], stride, GetSize().height, mSkipRect);
|
||||
BoxBlurHorizontal(mData, tmpData, lobes[2][0], lobes[2][1], stride, GetSize().height, mSkipRect);
|
||||
int32_t horizontalLobes[3][2];
|
||||
ComputeLobes(mBlurRadius.width, horizontalLobes);
|
||||
int32_t verticalLobes[3][2];
|
||||
ComputeLobes(mBlurRadius.height, verticalLobes);
|
||||
|
||||
// We want to allow for some extra space on the left for alignment reasons.
|
||||
int32_t maxLeftLobe = RoundUpToMultipleOf4(horizontalLobes[0][0] + 1).value();
|
||||
|
||||
IntSize integralImageSize(size.width + maxLeftLobe + horizontalLobes[1][1],
|
||||
size.height + verticalLobes[0][0] + verticalLobes[1][1] + 1);
|
||||
|
||||
#ifdef IS_BIG_ENDIAN
|
||||
const bool cIsBigEndian = true;
|
||||
#else
|
||||
const bool cIsBigEndian = false;
|
||||
#endif
|
||||
|
||||
if (cIsBigEndian || (integralImageSize.width * integralImageSize.height) > (1 << 24)) {
|
||||
// Fallback to old blurring code when the surface is so large it may
|
||||
// overflow our integral image!
|
||||
|
||||
// No need to use CheckedInt here - we have validated it in the constructor.
|
||||
size_t szB = stride * size.height;
|
||||
unsigned char* tmpData = new uint8_t[szB];
|
||||
|
||||
memset(tmpData, 0, szB);
|
||||
|
||||
if (mBlurRadius.width > 0) {
|
||||
BoxBlurHorizontal(mData, tmpData, horizontalLobes[0][0], horizontalLobes[0][1], stride, GetSize().height, mSkipRect);
|
||||
BoxBlurHorizontal(tmpData, mData, horizontalLobes[1][0], horizontalLobes[1][1], stride, GetSize().height, mSkipRect);
|
||||
BoxBlurHorizontal(mData, tmpData, horizontalLobes[2][0], horizontalLobes[2][1], stride, GetSize().height, mSkipRect);
|
||||
} else {
|
||||
uint8_t *tmp = mData;
|
||||
mData = tmpData;
|
||||
tmpData = tmp;
|
||||
}
|
||||
if (mBlurRadius.height > 0) {
|
||||
BoxBlurVertical(tmpData, mData, verticalLobes[0][0], verticalLobes[0][1], stride, GetSize().height, mSkipRect);
|
||||
BoxBlurVertical(mData, tmpData, verticalLobes[1][0], verticalLobes[1][1], stride, GetSize().height, mSkipRect);
|
||||
BoxBlurVertical(tmpData, mData, verticalLobes[2][0], verticalLobes[2][1], stride, GetSize().height, mSkipRect);
|
||||
} else {
|
||||
uint8_t *tmp = mData;
|
||||
mData = tmpData;
|
||||
tmpData = tmp;
|
||||
}
|
||||
|
||||
delete [] tmpData;
|
||||
} else {
|
||||
memcpy(tmpData, mData, stride * GetSize().height);
|
||||
}
|
||||
size_t integralImageStride = GetAlignedStride<16>(integralImageSize.width * 4);
|
||||
|
||||
if (mBlurRadius.height > 0) {
|
||||
int32_t lobes[3][2];
|
||||
ComputeLobes(mBlurRadius.height, lobes);
|
||||
BoxBlurVertical(tmpData, mData, lobes[0][0], lobes[0][1], stride, GetSize().height, mSkipRect);
|
||||
BoxBlurVertical(mData, tmpData, lobes[1][0], lobes[1][1], stride, GetSize().height, mSkipRect);
|
||||
BoxBlurVertical(tmpData, mData, lobes[2][0], lobes[2][1], stride, GetSize().height, mSkipRect);
|
||||
} else {
|
||||
memcpy(mData, tmpData, stride * GetSize().height);
|
||||
}
|
||||
AlignedArray<uint32_t> integralImage((integralImageStride / 4) * integralImageSize.height);
|
||||
|
||||
free(tmpData);
|
||||
#ifdef USE_SSE2
|
||||
if (Factory::HasSSE2()) {
|
||||
BoxBlur_SSE2(horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
|
||||
verticalLobes[0][1], integralImage, integralImageStride);
|
||||
BoxBlur_SSE2(horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
|
||||
verticalLobes[1][1], integralImage, integralImageStride);
|
||||
BoxBlur_SSE2(horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
|
||||
verticalLobes[2][1], integralImage, integralImageStride);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
BoxBlur_C(horizontalLobes[0][0], horizontalLobes[0][1], verticalLobes[0][0],
|
||||
verticalLobes[0][1], integralImage, integralImageStride);
|
||||
BoxBlur_C(horizontalLobes[1][0], horizontalLobes[1][1], verticalLobes[1][0],
|
||||
verticalLobes[1][1], integralImage, integralImageStride);
|
||||
BoxBlur_C(horizontalLobes[2][0], horizontalLobes[2][1], verticalLobes[2][0],
|
||||
verticalLobes[2][1], integralImage, integralImageStride);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Computes one row of an integral (summed-area) image.
 *
 * The row is made of three consecutive sections:
 *   - aLeftInflation entries that replicate the first source pixel,
 *   - aSourceWidth entries taken from the source row itself,
 *   - aRightInflation entries that replicate the last source pixel.
 * Every written entry is the running sum of this row so far plus the entry
 * at the same column in aPreviousRow.
 *
 * Exactly aLeftInflation + aSourceWidth + aRightInflation entries are written
 * to aDest and read from aPreviousRow. aDest and aPreviousRow may point to the
 * same row: each entry of the previous row is read before the matching entry
 * of the destination is stored.
 *
 * @param aDest           Destination row of the integral image.
 * @param aSource         Source row of 8-bit values; must hold at least
 *                        aSourceWidth bytes and aSourceWidth must be > 0.
 * @param aPreviousRow    Integral image row directly above aDest.
 * @param aSourceWidth    Number of source pixels in the row.
 * @param aLeftInflation  Number of replicated entries before the source data.
 * @param aRightInflation Number of replicated entries after the source data.
 */
MOZ_ALWAYS_INLINE void
GenerateIntegralRow(uint32_t *aDest, const uint8_t *aSource, uint32_t *aPreviousRow,
                    const uint32_t &aSourceWidth, const uint32_t &aLeftInflation, const uint32_t &aRightInflation)
{
  uint32_t currentRowSum = 0;

  // Left inflation: pretend the first pixel extends to the left.
  const uint32_t firstPixel = aSource[0];
  for (uint32_t x = 0; x < aLeftInflation; x++) {
    currentRowSum += firstPixel;
    *aDest++ = currentRowSum + *aPreviousRow++;
  }

  // Source pixels. This is done one byte at a time on purpose: reading four
  // bytes at once over-advanced aDest/aPreviousRow when aSourceWidth was not a
  // multiple of 4 (shifting the right inflation and summing padding bytes),
  // and also depended on the host byte order.
  for (uint32_t x = 0; x < aSourceWidth; x++) {
    currentRowSum += aSource[x];
    *aDest++ = currentRowSum + *aPreviousRow++;
  }

  // Right inflation: pretend the last pixel extends to the right.
  const uint32_t lastPixel = aSource[aSourceWidth - 1];
  for (uint32_t x = 0; x < aRightInflation; x++) {
    currentRowSum += lastPixel;
    *aDest++ = currentRowSum + *aPreviousRow++;
  }
}
|
||||
|
||||
/**
 * Fills aIntegralImage with the integral image of aSource, inflated on all
 * four sides by edge replication.
 *
 * Row layout of the result:
 *   - rows [0, aTopInflation] are built from the first source row,
 *   - rows (aTopInflation, aSize.height + aTopInflation) are built from
 *     source rows 1 .. aSize.height - 1,
 *   - the remaining aBottomInflation rows are built from the last source row.
 * Horizontal inflation is handled per row by GenerateIntegralRow.
 *
 * @param aLeftInflation       Entries added to the left of every row.
 * @param aRightInflation      Entries added to the right of every row.
 * @param aTopInflation        Extra rows generated from the first source row.
 * @param aBottomInflation     Extra rows generated from the last source row.
 * @param aIntegralImage       Destination buffer.
 * @param aIntegralImageStride Stride of the destination, in bytes.
 * @param aSource              8-bit source surface.
 * @param aSourceStride        Stride of the source, in bytes.
 * @param aSize                Size of the source surface in pixels.
 */
MOZ_ALWAYS_INLINE void
GenerateIntegralImage_C(int32_t aLeftInflation, int32_t aRightInflation,
                        int32_t aTopInflation, int32_t aBottomInflation,
                        uint32_t *aIntegralImage, size_t aIntegralImageStride,
                        uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
{
  // The stride is given in bytes; rows are addressed in 32-bit entries.
  uint32_t stride32bit = aIntegralImageStride / 4;

  const int32_t integralImageHeight = aSize.height + aTopInflation + aBottomInflation;

  // The first row has no row above it. Zero it so that it can be passed as its
  // own "previous row": each entry is read (as 0) before it is overwritten.
  memset(aIntegralImage, 0, aIntegralImageStride);

  GenerateIntegralRow(aIntegralImage, aSource, aIntegralImage,
                      aSize.width, aLeftInflation, aRightInflation);

  // Top inflation: keep accumulating the first source row.
  for (int y = 1; y < aTopInflation + 1; y++) {
    GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource,
                        aIntegralImage + (y - 1) * stride32bit,
                        aSize.width, aLeftInflation, aRightInflation);
  }

  // Rows backed by real source rows (source row 0 was consumed above).
  for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
    GenerateIntegralRow(aIntegralImage + (y * stride32bit),
                        aSource + aSourceStride * (y - aTopInflation),
                        aIntegralImage + (y - 1) * stride32bit,
                        aSize.width, aLeftInflation, aRightInflation);
  }

  // Bottom inflation: keep accumulating the last source row. The loop does
  // not execute when aBottomInflation is 0.
  for (int y = (aSize.height + aTopInflation); y < integralImageHeight; y++) {
    GenerateIntegralRow(aIntegralImage + (y * stride32bit),
                        aSource + ((aSize.height - 1) * aSourceStride),
                        aIntegralImage + (y - 1) * stride32bit,
                        aSize.width, aLeftInflation, aRightInflation);
  }
}
|
||||
|
||||
/**
|
||||
* Attempt to do an in-place box blur using an integral image.
|
||||
*/
|
||||
void
|
||||
AlphaBoxBlur::BoxBlur_C(int32_t aLeftLobe,
|
||||
int32_t aRightLobe,
|
||||
int32_t aTopLobe,
|
||||
int32_t aBottomLobe,
|
||||
uint32_t *aIntegralImage,
|
||||
size_t aIntegralImageStride)
|
||||
{
|
||||
IntSize size = GetSize();
|
||||
|
||||
MOZ_ASSERT(size.width > 0);
|
||||
|
||||
// Our 'left' or 'top' lobe will include the current pixel. i.e. when
|
||||
// looking at an integral image the value of a pixel at 'x,y' is calculated
|
||||
// using the value of the integral image values above/below that.
|
||||
aLeftLobe++;
|
||||
aTopLobe++;
|
||||
int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
|
||||
|
||||
MOZ_ASSERT(boxSize > 0);
|
||||
|
||||
if (boxSize == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t stride32bit = aIntegralImageStride / 4;
|
||||
|
||||
int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
|
||||
|
||||
GenerateIntegralImage_C(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
|
||||
aIntegralImage, aIntegralImageStride, mData,
|
||||
mStride, size);
|
||||
|
||||
uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
|
||||
|
||||
uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
|
||||
|
||||
// Storing these locally makes this about 30% faster! Presumably the compiler
|
||||
// can't be sure we're not altering the member variables in this loop.
|
||||
IntRect skipRect = mSkipRect;
|
||||
uint8_t *data = mData;
|
||||
int32_t stride = mStride;
|
||||
for (int32_t y = 0; y < size.height; y++) {
|
||||
bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();
|
||||
|
||||
uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * stride32bit - aLeftLobe);
|
||||
uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * stride32bit + aRightLobe);
|
||||
uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * stride32bit + aRightLobe);
|
||||
uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * stride32bit - aLeftLobe);
|
||||
|
||||
for (int32_t x = 0; x < size.width; x++) {
|
||||
if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
|
||||
x = skipRect.XMost() - 1;
|
||||
// Trigger early jump on coming loop iterations, this will be reset
|
||||
// next line anyway.
|
||||
inSkipRectY = false;
|
||||
continue;
|
||||
}
|
||||
int32_t topLeft = topLeftBase[x];
|
||||
int32_t topRight = topRightBase[x];
|
||||
int32_t bottomRight = bottomRightBase[x];
|
||||
int32_t bottomLeft = bottomLeftBase[x];
|
||||
|
||||
uint32_t value = bottomRight - topRight - bottomLeft;
|
||||
value += topLeft;
|
||||
|
||||
data[stride * y + x] = (uint64_t(reciprocal) * value) >> 32;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -7,6 +7,7 @@
|
||||
|
||||
#include "mozilla/gfx/Rect.h"
|
||||
#include "mozilla/gfx/Point.h"
|
||||
#include "mozilla/CheckedInt.h"
|
||||
|
||||
namespace mozilla {
|
||||
namespace gfx {
|
||||
@ -114,6 +115,13 @@ public:
|
||||
|
||||
private:
|
||||
|
||||
void BoxBlur_C(int32_t aLeftLobe, int32_t aRightLobe, int32_t aTopLobe,
|
||||
int32_t aBottomLobe, uint32_t *aIntegralImage, size_t aIntegralImageStride);
|
||||
void BoxBlur_SSE2(int32_t aLeftLobe, int32_t aRightLobe, int32_t aTopLobe,
|
||||
int32_t aBottomLobe, uint32_t *aIntegralImage, size_t aIntegralImageStride);
|
||||
|
||||
static CheckedInt<int32_t> RoundUpToMultipleOf4(int32_t aVal);
|
||||
|
||||
/**
|
||||
* A rect indicating the area where blurring is unnecessary, and the blur
|
||||
* algorithm should skip over it.
|
||||
|
250
gfx/2d/BlurSSE2.cpp
Normal file
250
gfx/2d/BlurSSE2.cpp
Normal file
@ -0,0 +1,250 @@
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "Blur.h"
|
||||
|
||||
#include "SSEHelpers.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
namespace mozilla {
|
||||
namespace gfx {
|
||||
|
||||
/**
 * Multiplies each of the four 32-bit lanes of aValues by the 32-bit
 * fixed-point factor in aDivisor, keeps the upper 32 bits of every 64-bit
 * product, and packs the four results into the bytes of the returned integer
 * (lane 0 in the lowest byte).
 *
 * @param aValues  Four unsigned 32-bit values.
 * @param aDivisor The multiplier, broadcast to all four lanes.
 * @param aMask    Mask selecting the upper 32 bits of each 64-bit half; the
 *                 caller in this file passes { 0, ~0, 0, ~0 }.
 */
MOZ_ALWAYS_INLINE
uint32_t DivideAndPack(__m128i aValues, __m128i aDivisor, __m128i aMask)
{
  const __m128i zero = _mm_setzero_si128();

  // _mm_mul_epu32 only multiplies lanes 0 and 2. Shifting the 64-bit products
  // right leaves their high halves in lanes 0 and 2: 00p300p1
  const __m128i evenProducts = _mm_mul_epu32(aValues, aDivisor);
  const __m128i evenResults = _mm_srli_epi64(evenProducts, 32);

  // Move lanes 1 and 3 down into the multiplied positions; the high halves of
  // these products already sit in lanes 1 and 3, so masking is enough.
  const __m128i oddInputs = _mm_srli_epi64(aValues, 32);
  const __m128i oddProducts = _mm_mul_epu32(oddInputs, aDivisor);
  const __m128i oddResults = _mm_and_si128(oddProducts, aMask);

  const __m128i results = _mm_or_si128(evenResults, oddResults); // p4p3p2p1

  // Narrow 32 -> 16 -> 8 bits (with saturation) and return the low 32 bits.
  const __m128i words = _mm_packs_epi32(results, zero);
  const __m128i bytes = _mm_packus_epi16(words, zero);

  return _mm_cvtsi128_si32(bytes);
}
|
||||
|
||||
/**
 * Writes the running (prefix) sum of a single source row into aDest, without
 * adding any previous row.
 *
 * The output consists of aLeftInflation entries that repeat the first source
 * pixel, aSourceWidth entries for the source pixels themselves and
 * aRightInflation entries that repeat the last source pixel.
 *
 * @param aDest           Destination row; receives
 *                        aLeftInflation + aSourceWidth + aRightInflation entries.
 * @param aSource         Source row of 8-bit values.
 * @param aSourceWidth    Number of pixels in the source row.
 * @param aLeftInflation  Replicated entries before the source data.
 * @param aRightInflation Replicated entries after the source data.
 */
MOZ_ALWAYS_INLINE
void LoadIntegralRowFromRow(uint32_t *aDest, const uint8_t *aSource,
                            int32_t aSourceWidth, int32_t aLeftInflation,
                            int32_t aRightInflation)
{
  int32_t runningSum = 0;
  int column = 0;

  // Left inflation: the first pixel, repeated.
  while (column < aLeftInflation) {
    runningSum += aSource[0];
    aDest[column] = runningSum;
    ++column;
  }

  // The source pixels themselves.
  const int sourceEnd = aSourceWidth + aLeftInflation;
  column = aLeftInflation;
  while (column < sourceEnd) {
    runningSum += aSource[column - aLeftInflation];
    aDest[column] = runningSum;
    ++column;
  }

  // Right inflation: the last pixel, repeated.
  const int rowEnd = sourceEnd + aRightInflation;
  column = sourceEnd;
  while (column < rowEnd) {
    runningSum += aSource[aSourceWidth - 1];
    aDest[column] = runningSum;
    ++column;
  }
}
|
||||
|
||||
// Computes the inclusive prefix sum of the four 32-bit integers held in
// aPixels, e.g. { 30, 50, 80, 100 } becomes { 30, 80, 160, 260 }.
// This seems to be the fastest way to do this after much testing.
MOZ_ALWAYS_INLINE
__m128i AccumulatePixelSums(__m128i aPixels)
{
  // Step 1: add every lane to its left neighbour.
  //   { a, b, c, d } + { 0, a, b, c } = { a, a+b, b+c, c+d }
  const __m128i shiftedOneLane = _mm_slli_si128(aPixels, 4);
  const __m128i pairSums = _mm_add_epi32(aPixels, shiftedOneLane);

  // Step 2: add the lower two pair sums onto the upper two lanes.
  //   { a, a+b, b+c, c+d } + { 0, 0, a, a+b }
  const __m128i shiftedTwoLanes = _mm_unpacklo_epi64(_mm_setzero_si128(), pairSums);

  return _mm_add_epi32(pairSums, shiftedTwoLanes);
}
|
||||
|
||||
/**
 * SSE2 version of the integral image generation: fills aIntegralImage with
 * the integral image of aSource, inflated on all four sides by edge
 * replication.
 *
 * The aligned loads/stores used here require every row of aIntegralImage to
 * start on a 16-byte boundary and aLeftInflation to be a multiple of 4 (the
 * latter is asserted). The source rows are read four bytes at a time, so a
 * row whose width is not a multiple of 4 is read up to 3 bytes past its last
 * pixel; the entries produced from those bytes are overwritten afterwards.
 *
 * @param aLeftInflation       Entries added to the left of every row.
 * @param aRightInflation      Entries added to the right of every row.
 * @param aTopInflation        Extra rows generated from the first source row.
 * @param aBottomInflation     Extra rows generated from the last source row.
 * @param aIntegralImage       Destination buffer.
 * @param aIntegralImageStride Stride of the destination, in bytes.
 * @param aSource              8-bit source surface.
 * @param aSourceStride        Stride of the source, in bytes.
 * @param aSize                Size of the source surface in pixels.
 */
MOZ_ALWAYS_INLINE void
GenerateIntegralImage_SSE2(int32_t aLeftInflation, int32_t aRightInflation,
                           int32_t aTopInflation, int32_t aBottomInflation,
                           uint32_t *aIntegralImage, size_t aIntegralImageStride,
                           uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
{
  MOZ_ASSERT(!(aLeftInflation & 3));

  uint32_t stride32bit = aIntegralImageStride / 4;

  IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
                            aSize.height + aTopInflation + aBottomInflation);

  // Row 0 holds the plain running sums of the first source row.
  LoadIntegralRowFromRow(aIntegralImage, aSource, aSize.width, aLeftInflation, aRightInflation);

  // Top inflation: every row is the previous row plus the first row again.
  for (int y = 1; y < aTopInflation + 1; y++) {
    uint32_t *intRow = aIntegralImage + (y * stride32bit);
    uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
    uint32_t *intFirstRow = aIntegralImage;

    for (int x = 0; x < integralImageSize.width; x += 4) {
      __m128i firstRow = _mm_load_si128((__m128i*)(intFirstRow + x));
      __m128i previousRow = _mm_load_si128((__m128i*)(intPrevRow + x));
      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(firstRow, previousRow));
    }
  }

  // Rows backed by real source rows.
  for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
    __m128i currentRowSum = _mm_setzero_si128();
    uint32_t *intRow = aIntegralImage + (y * stride32bit);
    uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
    uint8_t *sourceRow = aSource + aSourceStride * (y - aTopInflation);

    // Left inflation: the first pixel of the row, four entries at a time.
    uint32_t pixel = sourceRow[0];
    for (int x = 0; x < aLeftInflation; x += 4) {
      __m128i sumPixels = AccumulatePixelSums(_mm_set1_epi32(pixel));

      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);

      // Broadcast the last running sum for the next group of four.
      currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));

      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
    }

    // Source pixels, four at a time.
    for (int x = aLeftInflation; x < (aSize.width + aLeftInflation); x += 4) {
      uint32_t pixels = *(uint32_t*)(sourceRow + (x - aLeftInflation));

      // It's important to shuffle here. When we exit this loop currentRowSum
      // has to be set to sumPixels, so that the following loop can get the
      // correct pixel for the currentRowSum. The highest order pixel in
      // currentRowSum could've originated from accumulation in the stride.
      currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));

      // Zero-extend the four bytes to four 32-bit lanes before summing.
      __m128i sumPixels = AccumulatePixelSums(_mm_unpacklo_epi16(_mm_unpacklo_epi8( _mm_set1_epi32(pixels), _mm_setzero_si128()), _mm_setzero_si128()));
      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);

      currentRowSum = sumPixels;

      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
    }

    // Right inflation: the last pixel of the row.
    pixel = sourceRow[aSize.width - 1];
    int x = (aSize.width + aLeftInflation);
    if ((aSize.width & 3)) {
      // Deal with unaligned portion. Get the correct pixel from currentRowSum,
      // see explanation above.
      uint32_t intCurrentRowSum = ((uint32_t*)&currentRowSum)[(aSize.width % 4) - 1];
      for (; x < integralImageSize.width; x++) {
        // We could be unaligned here!
        if (!(x & 3)) {
          // aligned!
          currentRowSum = _mm_set1_epi32(intCurrentRowSum);
          break;
        }
        intCurrentRowSum += pixel;
        intRow[x] = intPrevRow[x] + intCurrentRowSum;
      }
    } else {
      currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
    }
    for (; x < integralImageSize.width; x += 4) {
      __m128i sumPixels = AccumulatePixelSums(_mm_set1_epi32(pixel));

      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);

      currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));

      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
    }
  }

  if (aBottomInflation) {
    // Store the last valid row of our source image in the last row of
    // our integral image. This will be overwritten with the correct values
    // in the upcoming loop.
    LoadIntegralRowFromRow(aIntegralImage + (integralImageSize.height - 1) * stride32bit,
                           aSource + (aSize.height - 1) * aSourceStride, aSize.width, aLeftInflation, aRightInflation);

    // Bottom inflation: every row is the previous row plus the running sums
    // of the last source row stored above.
    for (int y = aSize.height + aTopInflation; y < integralImageSize.height; y++) {
      __m128i *intRow = (__m128i*)(aIntegralImage + (y * stride32bit));
      __m128i *intPrevRow = (__m128i*)(aIntegralImage + (y - 1) * stride32bit);
      __m128i *intLastRow = (__m128i*)(aIntegralImage + (integralImageSize.height - 1) * stride32bit);

      for (int x = 0; x < integralImageSize.width; x += 4) {
        _mm_store_si128(intRow + (x / 4),
                        _mm_add_epi32(_mm_load_si128(intLastRow + (x / 4)),
                                      _mm_load_si128(intPrevRow + (x / 4))));
      }
    }
  }
}
|
||||
|
||||
/**
|
||||
* Attempt to do an in-place box blur using an integral image.
|
||||
*/
|
||||
void
|
||||
AlphaBoxBlur::BoxBlur_SSE2(int32_t aLeftLobe,
|
||||
int32_t aRightLobe,
|
||||
int32_t aTopLobe,
|
||||
int32_t aBottomLobe,
|
||||
uint32_t *aIntegralImage,
|
||||
size_t aIntegralImageStride)
|
||||
{
|
||||
IntSize size = GetSize();
|
||||
|
||||
MOZ_ASSERT(size.height > 0);
|
||||
|
||||
// Our 'left' or 'top' lobe will include the current pixel. i.e. when
|
||||
// looking at an integral image the value of a pixel at 'x,y' is calculated
|
||||
// using the value of the integral image values above/below that.
|
||||
aLeftLobe++;
|
||||
aTopLobe++;
|
||||
int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
|
||||
|
||||
MOZ_ASSERT(boxSize > 0);
|
||||
|
||||
if (boxSize == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
|
||||
|
||||
uint32_t stride32bit = aIntegralImageStride / 4;
|
||||
int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
|
||||
|
||||
GenerateIntegralImage_SSE2(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
|
||||
aIntegralImage, aIntegralImageStride, mData,
|
||||
mStride, size);
|
||||
|
||||
__m128i divisor = _mm_set1_epi32(reciprocal);
|
||||
__m128i mask = _mm_setr_epi32(0x0, 0xffffffff, 0x0, 0xffffffff);
|
||||
|
||||
// This points to the start of the rectangle within the IntegralImage that overlaps
|
||||
// the surface being blurred.
|
||||
uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
|
||||
|
||||
IntRect skipRect = mSkipRect;
|
||||
int32_t stride = mStride;
|
||||
uint8_t *data = mData;
|
||||
for (int32_t y = 0; y < size.height; y++) {
|
||||
bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();
|
||||
|
||||
uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
|
||||
uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) + aRightLobe);
|
||||
uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) + aRightLobe);
|
||||
uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
|
||||
|
||||
for (int32_t x = 0; x < size.width; x += 4) {
|
||||
if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
|
||||
x = skipRect.XMost() - 4;
|
||||
// Trigger early jump on coming loop iterations, this will be reset
|
||||
// next line anyway.
|
||||
inSkipRectY = false;
|
||||
continue;
|
||||
}
|
||||
__m128i topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
|
||||
__m128i topRight = loadUnaligned128((__m128i*)(topRightBase + x));
|
||||
__m128i bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
|
||||
__m128i bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));
|
||||
|
||||
__m128i values = _mm_add_epi32(_mm_sub_epi32(_mm_sub_epi32(bottomRight, topRight), bottomLeft), topLeft);
|
||||
|
||||
*(uint32_t*)(data + stride * y + x) = DivideAndPack(values, divisor, mask);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@ -6,8 +6,7 @@
|
||||
#include "ImageScaling.h"
|
||||
#include "mozilla/Attributes.h"
|
||||
|
||||
#include <xmmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
#include "SSEHelpers.h"
|
||||
|
||||
/* The functions below use the following system for averaging 4 pixels:
|
||||
*
|
||||
@ -108,17 +107,6 @@ MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
|
||||
return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
|
||||
}
|
||||
|
||||
/* Loads 128 bits from an address that does not have to be 16-byte aligned.
 *
 * Before Nehalem _mm_loadu_si128 could be very slow, this trick of doing two
 * 64-bit loads is a little faster. Once enough people are on architectures
 * where _mm_loadu_si128 is fast we can migrate to it.
 */
MOZ_ALWAYS_INLINE __m128i loadUnaligned128(const __m128i *aSource)
{
  const __m64 *halves = (const __m64*)aSource;

  // The initial register contents do not matter: both 64-bit halves are
  // overwritten by the two loads below.
  __m128 loaded = _mm_set1_ps(0);
  loaded = _mm_loadl_pi(loaded, halves);
  loaded = _mm_loadh_pi(loaded, halves + 1);

  return _mm_castps_si128(loaded);
}
|
||||
|
||||
MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
|
||||
{
|
||||
uint32_t sum = a ^ b ^ c;
|
||||
|
@ -116,7 +116,10 @@ endif
|
||||
ifneq (,$(INTEL_ARCHITECTURE))
|
||||
# VC2005 doesn't support _mm_castsi128_ps, so SSE2 is turned off
|
||||
ifneq (1400,$(_MSC_VER))
|
||||
CPPSRCS += ImageScalingSSE2.cpp
|
||||
CPPSRCS += \
|
||||
ImageScalingSSE2.cpp \
|
||||
BlurSSE2.cpp \
|
||||
$(NULL)
|
||||
DEFINES += -DUSE_SSE2
|
||||
endif
|
||||
endif
|
||||
@ -161,10 +164,12 @@ DEFINES := $(filter-out -DUNICODE -D_UNICODE,$(DEFINES))
|
||||
ifneq (,$(INTEL_ARCHITECTURE))
|
||||
ifdef GNU_CC
|
||||
ImageScalingSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
|
||||
BlurSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
|
||||
endif
|
||||
|
||||
ifdef SOLARIS_SUNPRO_CXX
|
||||
ImageScalingSSE2.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
|
||||
BlurSSE2.$(OBJ_SUFFIX): OS_CXXFLAGS += -xarch=sse2 -xO4
|
||||
endif
|
||||
endif
|
||||
|
||||
|
17
gfx/2d/SSEHelpers.h
Normal file
17
gfx/2d/SSEHelpers.h
Normal file
@ -0,0 +1,17 @@
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include <xmmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
|
||||
/* Loads 128 bits from an address that does not have to be 16-byte aligned.
 *
 * Before Nehalem _mm_loadu_si128 could be very slow, this trick of doing two
 * 64-bit loads is a little faster. Once enough people are on architectures
 * where _mm_loadu_si128 is fast we can migrate to it.
 */
MOZ_ALWAYS_INLINE __m128i loadUnaligned128(const __m128i *aSource)
{
  const __m64 *halves = (const __m64*)aSource;

  // The initial register contents do not matter: both 64-bit halves are
  // overwritten by the two loads below.
  __m128 loaded = _mm_set1_ps(0);
  loaded = _mm_loadl_pi(loaded, halves);
  loaded = _mm_loadh_pi(loaded, halves + 1);

  return _mm_castps_si128(loaded);
}
|
@ -81,6 +81,64 @@ BytesPerPixel(SurfaceFormat aFormat)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Owns a heap-allocated array of T and exposes a pointer into it that is
 * aligned to |alignment| bytes.
 *
 * The array is over-allocated by (alignment - 1) elements so that an aligned
 * start address always exists inside the allocation. mStorage is the pointer
 * returned by new[] (and the one that gets deleted); mPtr is the aligned
 * pointer handed out to users.
 *
 * Note: the type has an owning destructor but no custom copy operations, so
 * instances must not be copied.
 */
template<typename T, int alignment = 16>
struct AlignedArray
{
  /** Creates an empty array; the conversion to T* yields nullptr. */
  AlignedArray()
    : mStorage(nullptr)
    , mPtr(nullptr)
  {
  }

  /** Creates an array with room for aSize aligned elements. */
  MOZ_ALWAYS_INLINE AlignedArray(size_t aSize)
    : mStorage(nullptr)
    , mPtr(nullptr)
  {
    Realloc(aSize);
  }

  MOZ_ALWAYS_INLINE ~AlignedArray()
  {
    delete [] mStorage;
  }

  /** Releases the storage; the conversion to T* yields nullptr afterwards. */
  void Dealloc()
  {
    delete [] mStorage;
    mStorage = mPtr = nullptr;
  }

  /**
   * Discards the current contents and allocates room for aSize elements.
   * The previous contents are not preserved.
   */
  MOZ_ALWAYS_INLINE void Realloc(size_t aSize)
  {
    // Reset both pointers first so the object never refers to freed memory,
    // even if the allocation below does not return normally.
    Dealloc();

    mStorage = new T[aSize + (alignment - 1)];

    const uintptr_t misalignment = uintptr_t(mStorage) % alignment;
    if (misalignment) {
      // Our storage does not start at a <alignment>-byte boundary. Make sure
      // mPtr does! The cast has to be to T*, not to a fixed element type,
      // for the template to work with any T.
      mPtr = (T*)(uintptr_t(mStorage) + (alignment - misalignment));
    } else {
      mPtr = mStorage;
    }
  }

  /** Returns the aligned pointer (nullptr when nothing is allocated). */
  MOZ_ALWAYS_INLINE operator T*()
  {
    return mPtr;
  }

  T *mStorage;
  T *mPtr;
};
|
||||
|
||||
/**
 * Rounds aStride up to the next multiple of |alignment|.
 *
 * @param aStride The stride to align.
 * @return aStride itself when it already is a multiple of |alignment|,
 *         otherwise aStride plus (alignment - aStride % alignment).
 */
template<int alignment>
int32_t GetAlignedStride(int32_t aStride)
{
  const int32_t remainder = aStride % alignment;

  return remainder ? aStride + (alignment - remainder) : aStride;
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user