mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-12-13 18:27:35 +00:00
Bug 439375. Improve Gaussian blur inner loop. r=longsonr,sr=mats
This commit is contained in:
parent
df615121b9
commit
29161b5c48
@ -546,16 +546,6 @@ private:
|
||||
nsresult GetDXY(PRUint32 *aDX, PRUint32 *aDY, const nsSVGFilterInstance& aInstance);
|
||||
void InflateRectForBlur(nsRect* aRect, const nsSVGFilterInstance& aInstance);
|
||||
|
||||
PRUint8 *SetupPredivide(PRUint32 size) const;
|
||||
|
||||
void BoxBlurH(PRUint8 *aInput, PRUint8 *aOutput,
|
||||
PRInt32 aStride, const nsRect& aRegion,
|
||||
PRUint32 leftLobe, PRUint32 rightLobe, const PRUint8 *prediv);
|
||||
|
||||
void BoxBlurV(PRUint8 *aInput, PRUint8 *aOutput,
|
||||
PRInt32 aStride, const nsRect& aRegion,
|
||||
PRUint32 topLobe, PRUint32 bottomLobe, const PRUint8 *prediv);
|
||||
|
||||
void GaussianBlur(PRUint8 *aInput, PRUint8 *aOutput,
|
||||
nsSVGFilterResource *aFilterResource,
|
||||
PRUint32 aDX, PRUint32 aDY);
|
||||
@ -628,101 +618,110 @@ nsSVGFEGaussianBlurElement::SetStdDeviation(float stdDeviationX, float stdDeviat
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
void
|
||||
nsSVGFEGaussianBlurElement::BoxBlurH(PRUint8 *aInput, PRUint8 *aOutput,
|
||||
PRInt32 aStride, const nsRect &aRegion,
|
||||
PRUint32 leftLobe, PRUint32 rightLobe,
|
||||
const PRUint8 *prediv)
|
||||
/**
|
||||
* We want to speed up 1/N integer divisions --- integer division is
|
||||
* often rather slow.
|
||||
* We know that our input numerators V are constrained to be <= 255*N,
|
||||
* so the result of dividing by N always fits in 8 bits.
|
||||
* So we can try approximating the division V/N as V*K/(2^24) (integer
|
||||
* division, 32-bit multiply). Dividing by 2^24 is a simple shift so it's
|
||||
* fast. The main problem is choosing a value for K; this function returns
|
||||
* K's value.
|
||||
*
|
||||
* If the result is correct for the extrema, V=0 and V=255*N, then we'll
|
||||
* be in good shape since both the original function and our approximation
|
||||
* are linear. V=0 always gives 0 in both cases, no problem there.
|
||||
* For V=255*N, let's choose the largest K that doesn't cause overflow
|
||||
* and ensure that it gives the right answer. The constraints are
|
||||
* (1) 255*N*K < 2^32
|
||||
* and (2) 255*N*K >= 255*(2^24)
|
||||
*
|
||||
* From (1) we find the best value of K is floor((2^32 - 1)/(255*N)).
|
||||
* (2) tells us when this will be valid:
|
||||
* N*floor((2^32 - 1)/(255*N)) >= 2^24
|
||||
* Now, floor(X) > X - 1, so (2) holds if
|
||||
* N*((2^32 - 1)/(255*N) - 1) >= 2^24
|
||||
* (2^32 - 1)/255 - 2^24 >= N
|
||||
* N <= 65793
|
||||
*
|
||||
* If all that math confuses you, this should convince you:
|
||||
* > perl -e 'for($N=1;(255*$N*int(0xFFFFFFFF/(255*$N)))>>24==255;++$N){}print"$N\n"'
|
||||
* 66052
|
||||
*
|
||||
* So this is fine for all reasonable values of N. For larger values of N
|
||||
* we may as well just use the same approximation and accept the fact that
|
||||
* the output channel values will be a little low.
|
||||
*/
|
||||
static PRUint32 ComputeScaledDivisor(PRUint32 aDivisor)
|
||||
{
|
||||
PRInt32 boxSize = leftLobe + rightLobe + 1;
|
||||
PRInt32 posStart = aRegion.x - leftLobe;
|
||||
|
||||
for (PRInt32 y = aRegion.y; y < aRegion.YMost(); y++) {
|
||||
PRUint32 sums[4] = {0, 0, 0, 0};
|
||||
PRInt32 lineIndex = aStride * y;
|
||||
for (PRInt32 i = 0; i < boxSize; i++) {
|
||||
PRInt32 pos = posStart + i;
|
||||
pos = PR_MAX(pos, aRegion.x);
|
||||
pos = PR_MIN(pos, aRegion.XMost() - 1);
|
||||
PRInt32 index = lineIndex + (pos << 2);
|
||||
sums[0] += aInput[index ];
|
||||
sums[1] += aInput[index + 1];
|
||||
sums[2] += aInput[index + 2];
|
||||
sums[3] += aInput[index + 3];
|
||||
}
|
||||
for (PRInt32 x = aRegion.x; x < aRegion.XMost(); x++) {
|
||||
PRInt32 index = lineIndex + (x << 2);
|
||||
aOutput[index ] = prediv[sums[0]];
|
||||
aOutput[index + 1] = prediv[sums[1]];
|
||||
aOutput[index + 2] = prediv[sums[2]];
|
||||
aOutput[index + 3] = prediv[sums[3]];
|
||||
|
||||
PRInt32 tmp = x - leftLobe;
|
||||
PRInt32 last = PR_MAX(tmp, aRegion.x);
|
||||
PRInt32 next = PR_MIN(tmp + boxSize, aRegion.XMost() - 1);
|
||||
PRInt32 index2 = lineIndex + (next << 2);
|
||||
PRInt32 index3 = lineIndex + (last << 2);
|
||||
sums[0] += aInput[index2 ] - aInput[index3 ];
|
||||
sums[1] += aInput[index2 + 1] - aInput[index3 + 1];
|
||||
sums[2] += aInput[index2 + 2] - aInput[index3 + 2];
|
||||
sums[3] += aInput[index2 + 3] - aInput[index3 + 3];
|
||||
}
|
||||
}
|
||||
return PR_UINT32_MAX/(255*aDivisor);
|
||||
}
|
||||
|
||||
void
|
||||
nsSVGFEGaussianBlurElement::BoxBlurV(PRUint8 *aInput, PRUint8 *aOutput,
|
||||
PRInt32 aStride, const nsRect &aRegion,
|
||||
PRUint32 topLobe, PRUint32 bottomLobe,
|
||||
const PRUint8 *prediv)
|
||||
|
||||
static void
|
||||
BoxBlur(const PRUint8 *aInput, PRUint8 *aOutput,
|
||||
PRInt32 aStrideMinor, PRInt32 aStartMinor, PRInt32 aEndMinor,
|
||||
PRUint32 aLeftLobe, PRUint32 aRightLobe)
|
||||
{
|
||||
PRInt32 boxSize = topLobe + bottomLobe + 1;
|
||||
PRInt32 posStart = aRegion.y - topLobe;
|
||||
PRUint32 boxSize = aLeftLobe + aRightLobe + 1;
|
||||
PRUint32 scaledDivisor = ComputeScaledDivisor(boxSize);
|
||||
PRUint32 sums[4] = {0, 0, 0, 0};
|
||||
|
||||
for (PRInt32 x = aRegion.x; x < aRegion.XMost(); x++) {
|
||||
PRUint32 sums[4] = {0, 0, 0, 0};
|
||||
PRInt32 fourX = x << 2;
|
||||
for (PRInt32 i = 0; i < boxSize; i++) {
|
||||
PRInt32 pos = posStart + i;
|
||||
pos = PR_MAX(pos, aRegion.y);
|
||||
pos = PR_MIN(pos, aRegion.YMost() - 1);
|
||||
PRInt32 index = aStride * pos + fourX;
|
||||
sums[0] += aInput[index ];
|
||||
sums[1] += aInput[index + 1];
|
||||
sums[2] += aInput[index + 2];
|
||||
sums[3] += aInput[index + 3];
|
||||
for (PRUint32 i=0; i < boxSize; i++) {
|
||||
PRInt32 pos = aStartMinor - aLeftLobe + i;
|
||||
pos = PR_MAX(pos, aStartMinor);
|
||||
pos = PR_MIN(pos, aEndMinor - 1);
|
||||
#define SUM(j) sums[j] += aInput[aStrideMinor*pos + j];
|
||||
SUM(0); SUM(1); SUM(2); SUM(3);
|
||||
#undef SUM
|
||||
}
|
||||
|
||||
aOutput += aStrideMinor*aStartMinor;
|
||||
if (aStartMinor + boxSize <= aEndMinor) {
|
||||
const PRUint8 *lastInput = aInput + aStartMinor*aStrideMinor;
|
||||
const PRUint8 *nextInput = aInput + (aStartMinor + aRightLobe + 1)*aStrideMinor;
|
||||
#define OUTPUT(j) aOutput[j] = (sums[j]*scaledDivisor) >> 24;
|
||||
#define SUM(j) sums[j] += nextInput[j] - lastInput[j];
|
||||
for (PRInt32 minor = aStartMinor; minor < aStartMinor + aLeftLobe; minor++) {
|
||||
OUTPUT(0); OUTPUT(1); OUTPUT(2); OUTPUT(3);
|
||||
SUM(0); SUM(1); SUM(2); SUM(3);
|
||||
nextInput += aStrideMinor;
|
||||
aOutput += aStrideMinor;
|
||||
}
|
||||
for (PRInt32 minor = aStartMinor + aLeftLobe; minor < aEndMinor - aRightLobe - 1; minor++) {
|
||||
OUTPUT(0); OUTPUT(1); OUTPUT(2); OUTPUT(3);
|
||||
SUM(0); SUM(1); SUM(2); SUM(3);
|
||||
lastInput += aStrideMinor;
|
||||
nextInput += aStrideMinor;
|
||||
aOutput += aStrideMinor;
|
||||
}
|
||||
for (PRInt32 y = aRegion.y; y < aRegion.YMost(); y++) {
|
||||
PRInt32 index = aStride * y + fourX;
|
||||
aOutput[index ] = prediv[sums[0]];
|
||||
aOutput[index + 1] = prediv[sums[1]];
|
||||
aOutput[index + 2] = prediv[sums[2]];
|
||||
aOutput[index + 3] = prediv[sums[3]];
|
||||
// nextInput is now aInput + aEndMinor*aStrideMinor. Set it back to
|
||||
// aInput + (aEndMinor - 1)*aStrideMinor so we read the last pixel in every
|
||||
// iteration of the next loop.
|
||||
nextInput -= aStrideMinor;
|
||||
for (PRInt32 minor = aEndMinor - aRightLobe - 1; minor < aEndMinor; minor++) {
|
||||
OUTPUT(0); OUTPUT(1); OUTPUT(2); OUTPUT(3);
|
||||
SUM(0); SUM(1); SUM(2); SUM(3);
|
||||
lastInput += aStrideMinor;
|
||||
aOutput += aStrideMinor;
|
||||
#undef SUM
|
||||
}
|
||||
} else {
|
||||
for (PRInt32 minor = aStartMinor; minor < aEndMinor; minor++) {
|
||||
PRInt32 tmp = minor - aLeftLobe;
|
||||
PRInt32 last = PR_MAX(tmp, aStartMinor);
|
||||
PRInt32 next = PR_MIN(tmp + boxSize, aEndMinor - 1);
|
||||
|
||||
PRInt32 tmp = y - topLobe;
|
||||
PRInt32 last = PR_MAX(tmp, aRegion.y);
|
||||
PRInt32 next = PR_MIN(tmp + boxSize, aRegion.YMost() - 1);
|
||||
PRInt32 index2 = aStride * next + fourX;
|
||||
PRInt32 index3 = aStride * last + fourX;
|
||||
sums[0] += aInput[index2 ] - aInput[index3 ];
|
||||
sums[1] += aInput[index2 + 1] - aInput[index3 + 1];
|
||||
sums[2] += aInput[index2 + 2] - aInput[index3 + 2];
|
||||
sums[3] += aInput[index2 + 3] - aInput[index3 + 3];
|
||||
OUTPUT(0); OUTPUT(1); OUTPUT(2); OUTPUT(3);
|
||||
#define SUM(j) sums[j] += aInput[aStrideMinor*next + j] - \
|
||||
aInput[aStrideMinor*last + j];
|
||||
SUM(0); SUM(1); SUM(2); SUM(3);
|
||||
aOutput += aStrideMinor;
|
||||
#undef SUM
|
||||
#undef OUTPUT
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
PRUint8 *
|
||||
nsSVGFEGaussianBlurElement::SetupPredivide(PRUint32 size) const
|
||||
{
|
||||
PRUint8 *tmp = new PRUint8[size * 256];
|
||||
if (tmp) {
|
||||
for (PRUint32 i = 0; i < 256; i++)
|
||||
memset(tmp + i * size, i, size);
|
||||
}
|
||||
return tmp;
|
||||
}
|
||||
|
||||
nsresult
|
||||
nsSVGFEGaussianBlurElement::GetDXY(PRUint32 *aDX, PRUint32 *aDY,
|
||||
const nsSVGFilterInstance& aInstance)
|
||||
@ -753,12 +752,11 @@ nsSVGFEGaussianBlurElement::GaussianBlur(PRUint8 *aInput, PRUint8 *aOutput,
|
||||
nsSVGFilterResource *aFilterResource,
|
||||
PRUint32 aDX, PRUint32 aDY)
|
||||
{
|
||||
NS_ASSERTION(aDX > 0 && aDY > 0, "Invalid stdDeviation!");
|
||||
|
||||
nsAutoArrayPtr<PRUint8> tmp(new PRUint8[aFilterResource->GetDataSize()]);
|
||||
if (!tmp)
|
||||
return;
|
||||
memset(tmp, 0, aFilterResource->GetDataSize());
|
||||
|
||||
nsRect rect = aFilterResource->GetSurfaceRect();
|
||||
#ifdef DEBUG_tor
|
||||
fprintf(stderr, "FILTER GAUSS rect: %d,%d %dx%d\n",
|
||||
@ -767,53 +765,29 @@ nsSVGFEGaussianBlurElement::GaussianBlur(PRUint8 *aInput, PRUint8 *aOutput,
|
||||
|
||||
PRUint32 stride = aFilterResource->GetDataStride();
|
||||
|
||||
if (aDX & 1) {
|
||||
// odd
|
||||
nsAutoArrayPtr<PRUint8> prediv(SetupPredivide(2 * (aDX / 2) + 1));
|
||||
if (!prediv)
|
||||
return;
|
||||
|
||||
BoxBlurH(aInput, tmp, stride, rect, aDX/2, aDX/2, prediv);
|
||||
BoxBlurH(tmp, aOutput, stride, rect, aDX/2, aDX/2, prediv);
|
||||
BoxBlurH(aOutput, tmp, stride, rect, aDX/2, aDX/2, prediv);
|
||||
if (aDX == 0) {
|
||||
aFilterResource->CopyImageSubregion(tmp, aInput);
|
||||
} else {
|
||||
// even
|
||||
if (aDX == 0) {
|
||||
aFilterResource->CopyImageSubregion(tmp, aInput);
|
||||
} else {
|
||||
nsAutoArrayPtr<PRUint8> prediv(SetupPredivide(2 * (aDX / 2) + 1));
|
||||
nsAutoArrayPtr<PRUint8> prediv2(SetupPredivide(2 * (aDX / 2)));
|
||||
if (!prediv || !prediv2)
|
||||
return;
|
||||
|
||||
BoxBlurH(aInput, tmp, stride, rect, aDX/2, aDX/2 - 1, prediv2);
|
||||
BoxBlurH(tmp, aOutput, stride, rect, aDX/2 - 1, aDX/2, prediv2);
|
||||
BoxBlurH(aOutput, tmp, stride, rect, aDX/2, aDX/2, prediv);
|
||||
PRInt32 longLobe = aDX/2;
|
||||
PRInt32 shortLobe = (aDX & 1) ? longLobe : longLobe - 1;
|
||||
for (PRInt32 major = rect.y; major < rect.YMost(); ++major) {
|
||||
PRInt32 ms = major*stride;
|
||||
BoxBlur(aInput + ms, tmp + ms, 4, rect.x, rect.XMost(), longLobe, shortLobe);
|
||||
BoxBlur(tmp + ms, aOutput + ms, 4, rect.x, rect.XMost(), shortLobe, longLobe);
|
||||
BoxBlur(aOutput + ms, tmp + ms, 4, rect.x, rect.XMost(), longLobe, longLobe);
|
||||
}
|
||||
}
|
||||
|
||||
if (aDY & 1) {
|
||||
// odd
|
||||
nsAutoArrayPtr<PRUint8> prediv(SetupPredivide(2 * (aDY / 2) + 1));
|
||||
if (!prediv)
|
||||
return;
|
||||
|
||||
BoxBlurV(tmp, aOutput, stride, rect, aDY/2, aDY/2, prediv);
|
||||
BoxBlurV(aOutput, tmp, stride, rect, aDY/2, aDY/2, prediv);
|
||||
BoxBlurV(tmp, aOutput, stride, rect, aDY/2, aDY/2, prediv);
|
||||
if (aDY == 0) {
|
||||
aFilterResource->CopyImageSubregion(aOutput, tmp);
|
||||
} else {
|
||||
// even
|
||||
if (aDY == 0) {
|
||||
aFilterResource->CopyImageSubregion(aOutput, tmp);
|
||||
} else {
|
||||
nsAutoArrayPtr<PRUint8> prediv(SetupPredivide(2 * (aDY / 2) + 1));
|
||||
nsAutoArrayPtr<PRUint8> prediv2(SetupPredivide(2 * (aDY / 2)));
|
||||
if (!prediv || !prediv2)
|
||||
return;
|
||||
|
||||
BoxBlurV(tmp, aOutput, stride, rect, aDY/2, aDY/2 - 1, prediv2);
|
||||
BoxBlurV(aOutput, tmp, stride, rect, aDY/2 - 1, aDY/2, prediv2);
|
||||
BoxBlurV(tmp, aOutput, stride, rect, aDY/2, aDY/2, prediv);
|
||||
PRInt32 longLobe = aDY/2;
|
||||
PRInt32 shortLobe = (aDY & 1) ? longLobe : longLobe - 1;
|
||||
for (PRInt32 major = rect.x; major < rect.XMost(); ++major) {
|
||||
PRInt32 ms = major*4;
|
||||
BoxBlur(tmp + ms, aOutput + ms, stride, rect.y, rect.YMost(), longLobe, shortLobe);
|
||||
BoxBlur(aOutput + ms, tmp + ms, stride, rect.y, rect.YMost(), shortLobe, longLobe);
|
||||
BoxBlur(tmp + ms, aOutput + ms, stride, rect.y, rect.YMost(), longLobe, longLobe);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user