Bug 439375. Improve Gaussian blur inner loop. r=longsonr,sr=mats

This commit is contained in:
Robert O'Callahan 2008-06-26 10:12:12 +12:00
parent df615121b9
commit 29161b5c48

View File

@ -546,16 +546,6 @@ private:
nsresult GetDXY(PRUint32 *aDX, PRUint32 *aDY, const nsSVGFilterInstance& aInstance);
void InflateRectForBlur(nsRect* aRect, const nsSVGFilterInstance& aInstance);
PRUint8 *SetupPredivide(PRUint32 size) const;
void BoxBlurH(PRUint8 *aInput, PRUint8 *aOutput,
PRInt32 aStride, const nsRect& aRegion,
PRUint32 leftLobe, PRUint32 rightLobe, const PRUint8 *prediv);
void BoxBlurV(PRUint8 *aInput, PRUint8 *aOutput,
PRInt32 aStride, const nsRect& aRegion,
PRUint32 topLobe, PRUint32 bottomLobe, const PRUint8 *prediv);
void GaussianBlur(PRUint8 *aInput, PRUint8 *aOutput,
nsSVGFilterResource *aFilterResource,
PRUint32 aDX, PRUint32 aDY);
@ -628,101 +618,110 @@ nsSVGFEGaussianBlurElement::SetStdDeviation(float stdDeviationX, float stdDeviat
return NS_OK;
}
void
nsSVGFEGaussianBlurElement::BoxBlurH(PRUint8 *aInput, PRUint8 *aOutput,
PRInt32 aStride, const nsRect &aRegion,
PRUint32 leftLobe, PRUint32 rightLobe,
const PRUint8 *prediv)
/**
* We want to speed up 1/N integer divisions --- integer division is
* often rather slow.
* We know that our input numerators V are constrained to be <= 255*N,
* so the result of dividing by N always fits in 8 bits.
* So we can try approximating the division V/N as V*K/(2^24) (integer
* division, 32-bit multiply). Dividing by 2^24 is a simple shift so it's
* fast. The main problem is choosing a value for K; this function returns
* K's value.
*
* If the result is correct for the extrema, V=0 and V=255*N, then we'll
* be in good shape since both the original function and our approximation
* are linear. V=0 always gives 0 in both cases, no problem there.
* For V=255*N, let's choose the largest K that doesn't cause overflow
* and ensure that it gives the right answer. The constraints are
* (1) 255*N*K < 2^32
* and (2) 255*N*K >= 255*(2^24)
*
* From (1) we find the best value of K is floor((2^32 - 1)/(255*N)).
* (2) tells us when this will be valid:
* N*floor((2^32 - 1)/(255*N)) >= 2^24
* Now, floor(X) > X - 1, so (2) holds if
* N*((2^32 - 1)/(255*N) - 1) >= 2^24
* (2^32 - 1)/255 - 2^24 >= N
* N <= 65793
*
* If all that math confuses you, this should convince you:
* > perl -e 'for($N=1;(255*$N*int(0xFFFFFFFF/(255*$N)))>>24==255;++$N){}print"$N\n"'
* 66052
*
* So this is fine for all reasonable values of N. For larger values of N
* we may as well just use the same approximation and accept the fact that
* the output channel values will be a little low.
*/
static PRUint32 ComputeScaledDivisor(PRUint32 aDivisor)
{
PRInt32 boxSize = leftLobe + rightLobe + 1;
PRInt32 posStart = aRegion.x - leftLobe;
for (PRInt32 y = aRegion.y; y < aRegion.YMost(); y++) {
PRUint32 sums[4] = {0, 0, 0, 0};
PRInt32 lineIndex = aStride * y;
for (PRInt32 i = 0; i < boxSize; i++) {
PRInt32 pos = posStart + i;
pos = PR_MAX(pos, aRegion.x);
pos = PR_MIN(pos, aRegion.XMost() - 1);
PRInt32 index = lineIndex + (pos << 2);
sums[0] += aInput[index ];
sums[1] += aInput[index + 1];
sums[2] += aInput[index + 2];
sums[3] += aInput[index + 3];
}
for (PRInt32 x = aRegion.x; x < aRegion.XMost(); x++) {
PRInt32 index = lineIndex + (x << 2);
aOutput[index ] = prediv[sums[0]];
aOutput[index + 1] = prediv[sums[1]];
aOutput[index + 2] = prediv[sums[2]];
aOutput[index + 3] = prediv[sums[3]];
PRInt32 tmp = x - leftLobe;
PRInt32 last = PR_MAX(tmp, aRegion.x);
PRInt32 next = PR_MIN(tmp + boxSize, aRegion.XMost() - 1);
PRInt32 index2 = lineIndex + (next << 2);
PRInt32 index3 = lineIndex + (last << 2);
sums[0] += aInput[index2 ] - aInput[index3 ];
sums[1] += aInput[index2 + 1] - aInput[index3 + 1];
sums[2] += aInput[index2 + 2] - aInput[index3 + 2];
sums[3] += aInput[index2 + 3] - aInput[index3 + 3];
}
}
return PR_UINT32_MAX/(255*aDivisor);
}
void
nsSVGFEGaussianBlurElement::BoxBlurV(PRUint8 *aInput, PRUint8 *aOutput,
PRInt32 aStride, const nsRect &aRegion,
PRUint32 topLobe, PRUint32 bottomLobe,
const PRUint8 *prediv)
static void
BoxBlur(const PRUint8 *aInput, PRUint8 *aOutput,
PRInt32 aStrideMinor, PRInt32 aStartMinor, PRInt32 aEndMinor,
PRUint32 aLeftLobe, PRUint32 aRightLobe)
{
PRInt32 boxSize = topLobe + bottomLobe + 1;
PRInt32 posStart = aRegion.y - topLobe;
PRUint32 boxSize = aLeftLobe + aRightLobe + 1;
PRUint32 scaledDivisor = ComputeScaledDivisor(boxSize);
PRUint32 sums[4] = {0, 0, 0, 0};
for (PRInt32 x = aRegion.x; x < aRegion.XMost(); x++) {
PRUint32 sums[4] = {0, 0, 0, 0};
PRInt32 fourX = x << 2;
for (PRInt32 i = 0; i < boxSize; i++) {
PRInt32 pos = posStart + i;
pos = PR_MAX(pos, aRegion.y);
pos = PR_MIN(pos, aRegion.YMost() - 1);
PRInt32 index = aStride * pos + fourX;
sums[0] += aInput[index ];
sums[1] += aInput[index + 1];
sums[2] += aInput[index + 2];
sums[3] += aInput[index + 3];
for (PRUint32 i=0; i < boxSize; i++) {
PRInt32 pos = aStartMinor - aLeftLobe + i;
pos = PR_MAX(pos, aStartMinor);
pos = PR_MIN(pos, aEndMinor - 1);
#define SUM(j) sums[j] += aInput[aStrideMinor*pos + j];
SUM(0); SUM(1); SUM(2); SUM(3);
#undef SUM
}
aOutput += aStrideMinor*aStartMinor;
if (aStartMinor + boxSize <= aEndMinor) {
const PRUint8 *lastInput = aInput + aStartMinor*aStrideMinor;
const PRUint8 *nextInput = aInput + (aStartMinor + aRightLobe + 1)*aStrideMinor;
#define OUTPUT(j) aOutput[j] = (sums[j]*scaledDivisor) >> 24;
#define SUM(j) sums[j] += nextInput[j] - lastInput[j];
for (PRInt32 minor = aStartMinor; minor < aStartMinor + aLeftLobe; minor++) {
OUTPUT(0); OUTPUT(1); OUTPUT(2); OUTPUT(3);
SUM(0); SUM(1); SUM(2); SUM(3);
nextInput += aStrideMinor;
aOutput += aStrideMinor;
}
for (PRInt32 minor = aStartMinor + aLeftLobe; minor < aEndMinor - aRightLobe - 1; minor++) {
OUTPUT(0); OUTPUT(1); OUTPUT(2); OUTPUT(3);
SUM(0); SUM(1); SUM(2); SUM(3);
lastInput += aStrideMinor;
nextInput += aStrideMinor;
aOutput += aStrideMinor;
}
for (PRInt32 y = aRegion.y; y < aRegion.YMost(); y++) {
PRInt32 index = aStride * y + fourX;
aOutput[index ] = prediv[sums[0]];
aOutput[index + 1] = prediv[sums[1]];
aOutput[index + 2] = prediv[sums[2]];
aOutput[index + 3] = prediv[sums[3]];
// nextInput is now aInput + aEndMinor*aStrideMinor. Set it back to
// aInput + (aEndMinor - 1)*aStrideMinor so we read the last pixel in every
// iteration of the next loop.
nextInput -= aStrideMinor;
for (PRInt32 minor = aEndMinor - aRightLobe - 1; minor < aEndMinor; minor++) {
OUTPUT(0); OUTPUT(1); OUTPUT(2); OUTPUT(3);
SUM(0); SUM(1); SUM(2); SUM(3);
lastInput += aStrideMinor;
aOutput += aStrideMinor;
#undef SUM
}
} else {
for (PRInt32 minor = aStartMinor; minor < aEndMinor; minor++) {
PRInt32 tmp = minor - aLeftLobe;
PRInt32 last = PR_MAX(tmp, aStartMinor);
PRInt32 next = PR_MIN(tmp + boxSize, aEndMinor - 1);
PRInt32 tmp = y - topLobe;
PRInt32 last = PR_MAX(tmp, aRegion.y);
PRInt32 next = PR_MIN(tmp + boxSize, aRegion.YMost() - 1);
PRInt32 index2 = aStride * next + fourX;
PRInt32 index3 = aStride * last + fourX;
sums[0] += aInput[index2 ] - aInput[index3 ];
sums[1] += aInput[index2 + 1] - aInput[index3 + 1];
sums[2] += aInput[index2 + 2] - aInput[index3 + 2];
sums[3] += aInput[index2 + 3] - aInput[index3 + 3];
OUTPUT(0); OUTPUT(1); OUTPUT(2); OUTPUT(3);
#define SUM(j) sums[j] += aInput[aStrideMinor*next + j] - \
aInput[aStrideMinor*last + j];
SUM(0); SUM(1); SUM(2); SUM(3);
aOutput += aStrideMinor;
#undef SUM
#undef OUTPUT
}
}
}
PRUint8 *
nsSVGFEGaussianBlurElement::SetupPredivide(PRUint32 size) const
{
PRUint8 *tmp = new PRUint8[size * 256];
if (tmp) {
for (PRUint32 i = 0; i < 256; i++)
memset(tmp + i * size, i, size);
}
return tmp;
}
nsresult
nsSVGFEGaussianBlurElement::GetDXY(PRUint32 *aDX, PRUint32 *aDY,
const nsSVGFilterInstance& aInstance)
@ -753,12 +752,11 @@ nsSVGFEGaussianBlurElement::GaussianBlur(PRUint8 *aInput, PRUint8 *aOutput,
nsSVGFilterResource *aFilterResource,
PRUint32 aDX, PRUint32 aDY)
{
NS_ASSERTION(aDX > 0 && aDY > 0, "Invalid stdDeviation!");
nsAutoArrayPtr<PRUint8> tmp(new PRUint8[aFilterResource->GetDataSize()]);
if (!tmp)
return;
memset(tmp, 0, aFilterResource->GetDataSize());
nsRect rect = aFilterResource->GetSurfaceRect();
#ifdef DEBUG_tor
fprintf(stderr, "FILTER GAUSS rect: %d,%d %dx%d\n",
@ -767,53 +765,29 @@ nsSVGFEGaussianBlurElement::GaussianBlur(PRUint8 *aInput, PRUint8 *aOutput,
PRUint32 stride = aFilterResource->GetDataStride();
if (aDX & 1) {
// odd
nsAutoArrayPtr<PRUint8> prediv(SetupPredivide(2 * (aDX / 2) + 1));
if (!prediv)
return;
BoxBlurH(aInput, tmp, stride, rect, aDX/2, aDX/2, prediv);
BoxBlurH(tmp, aOutput, stride, rect, aDX/2, aDX/2, prediv);
BoxBlurH(aOutput, tmp, stride, rect, aDX/2, aDX/2, prediv);
if (aDX == 0) {
aFilterResource->CopyImageSubregion(tmp, aInput);
} else {
// even
if (aDX == 0) {
aFilterResource->CopyImageSubregion(tmp, aInput);
} else {
nsAutoArrayPtr<PRUint8> prediv(SetupPredivide(2 * (aDX / 2) + 1));
nsAutoArrayPtr<PRUint8> prediv2(SetupPredivide(2 * (aDX / 2)));
if (!prediv || !prediv2)
return;
BoxBlurH(aInput, tmp, stride, rect, aDX/2, aDX/2 - 1, prediv2);
BoxBlurH(tmp, aOutput, stride, rect, aDX/2 - 1, aDX/2, prediv2);
BoxBlurH(aOutput, tmp, stride, rect, aDX/2, aDX/2, prediv);
PRInt32 longLobe = aDX/2;
PRInt32 shortLobe = (aDX & 1) ? longLobe : longLobe - 1;
for (PRInt32 major = rect.y; major < rect.YMost(); ++major) {
PRInt32 ms = major*stride;
BoxBlur(aInput + ms, tmp + ms, 4, rect.x, rect.XMost(), longLobe, shortLobe);
BoxBlur(tmp + ms, aOutput + ms, 4, rect.x, rect.XMost(), shortLobe, longLobe);
BoxBlur(aOutput + ms, tmp + ms, 4, rect.x, rect.XMost(), longLobe, longLobe);
}
}
if (aDY & 1) {
// odd
nsAutoArrayPtr<PRUint8> prediv(SetupPredivide(2 * (aDY / 2) + 1));
if (!prediv)
return;
BoxBlurV(tmp, aOutput, stride, rect, aDY/2, aDY/2, prediv);
BoxBlurV(aOutput, tmp, stride, rect, aDY/2, aDY/2, prediv);
BoxBlurV(tmp, aOutput, stride, rect, aDY/2, aDY/2, prediv);
if (aDY == 0) {
aFilterResource->CopyImageSubregion(aOutput, tmp);
} else {
// even
if (aDY == 0) {
aFilterResource->CopyImageSubregion(aOutput, tmp);
} else {
nsAutoArrayPtr<PRUint8> prediv(SetupPredivide(2 * (aDY / 2) + 1));
nsAutoArrayPtr<PRUint8> prediv2(SetupPredivide(2 * (aDY / 2)));
if (!prediv || !prediv2)
return;
BoxBlurV(tmp, aOutput, stride, rect, aDY/2, aDY/2 - 1, prediv2);
BoxBlurV(aOutput, tmp, stride, rect, aDY/2 - 1, aDY/2, prediv2);
BoxBlurV(tmp, aOutput, stride, rect, aDY/2, aDY/2, prediv);
PRInt32 longLobe = aDY/2;
PRInt32 shortLobe = (aDY & 1) ? longLobe : longLobe - 1;
for (PRInt32 major = rect.x; major < rect.XMost(); ++major) {
PRInt32 ms = major*4;
BoxBlur(tmp + ms, aOutput + ms, stride, rect.y, rect.YMost(), longLobe, shortLobe);
BoxBlur(aOutput + ms, tmp + ms, stride, rect.y, rect.YMost(), shortLobe, longLobe);
BoxBlur(tmp + ms, aOutput + ms, stride, rect.y, rect.YMost(), longLobe, longLobe);
}
}
}