Bug #244357 --> Fix several underflow situations in the chi2 algorithm used for junk mail detection. Simplify the math to make it less computensive.

Patch origianlly by tenthumbs@cybernex.net

sr=mscott
This commit is contained in:
scott%scott-macgregor.org 2004-10-19 21:25:00 +00:00
parent 54c270b37c
commit aa62f1a883
2 changed files with 323 additions and 18 deletions

View File

@ -79,6 +79,7 @@
#include "nsIHTMLToTextSink.h"
#include "nsIDocumentEncoder.h"
#include "nsIncompleteGamma.h"
#include <math.h>
static PRLogModuleInfo *BayesianFilterLogModule = nsnull;
@ -923,21 +924,28 @@ PR_STATIC_CALLBACK(int) compareTokens(const void* p1, const void* p2, void* /* d
inline double dmax(double x, double y) { return (x > y ? x : y); }
inline double dmin(double x, double y) { return (x < y ? x : y); }
double chi2Q (double x2, PRUint32 v)
// Chi square functions are implemented by an incomplete gamma function.
// Note that chi2P's callers multiply the arguments by 2 but chi2P
// divides them by 2 again. Inlining chi2P gives the compiler a
// chance to notice this.
// Both chi2P and nsIncompleteGammaP set *error negative on domain
// errors and nsIncompleteGammaP sets it posivive on internal errors.
// This may be useful but the chi2P callers treat any error as fatal.
// Note that converting unsigned ints to floating point can be slow on
// some platforms (like Intel) so use signed quantities for the numeric
// routines.
static inline double chi2P (double chi2, double nu, PRInt32 *error)
{
PRUint32 i;
double m = x2 / 2.0;
double sum = exp(-m);
double term = sum;
NS_ASSERTION(!(v & 1), "chi2Q called with odd value");
for (i=1; i < v/2; ++i)
{
term *= m / i;
sum += term;
}
return dmin(sum,1.0);
// domain checks; set error and return a dummy value
if (chi2 < 0.0 || nu <= 0.0)
{
*error = -1;
return 0.0;
}
// reversing the arguments is intentional
return nsIncompleteGammaP (nu/2.0, chi2/2.0, error);
}
void nsBayesianFilter::classifyMessage(Tokenizer& tokenizer, const char* messageURI,
@ -1037,12 +1045,18 @@ void nsBayesianFilter::classifyMessage(Tokenizer& tokenizer, const char* message
if (goodclues > 0)
{
S = 1.0 - chi2Q(-2.0 * S, 2 * goodclues);
H = 1.0 - chi2Q(-2.0 * H, 2 * goodclues);
prob = (S-H +1.0) / 2.0;
PRInt32 chi_error;
S = chi2P(-2.0 * S, 2.0 * goodclues, &chi_error);
if (!chi_error)
H = chi2P(-2.0 * H, 2.0 * goodclues, &chi_error);
// if any error toss the entire calculation
if (!chi_error)
prob = (S-H +1.0) / 2.0;
else
prob = 0.5;
}
else
prob = 0.5;
prob = 0.5;
PRBool isJunk = (prob >= mJunkProbabilityThreshold);
PR_LOG(BayesianFilterLogModule, PR_LOG_ALWAYS, ("%s is junk probability = (%f) HAM SCORE:%f SPAM SCORE:%f", messageURI, prob,H,S));

View File

@ -0,0 +1,291 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/***** BEGIN LICENSE BLOCK *****
Version: MPL 1.1/GPL 2.0/LGPL 2.1
The contents of this file are subject to the Mozilla Public License Version
1.1 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.mozilla.org/MPL/
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
for the specific language governing rights and limitations under the
License.
The Original Code is bayesian spam filter
Portions created by the Initial Developer are Copyright (C) 2004
the Initial Developer. All Rights Reserved.
Contributor(s):
Scott MacGregor <mscott@mozilla.org>
tenthumbs@cybernex.net
Alternatively, the contents of this file may be used under the terms of
either of the GNU General Public License Version 2 or later (the "GPL"),
or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
in which case the provisions of the GPL or the LGPL are applicable instead
of those above. If you wish to allow use of your version of this file only
under the terms of either the GPL or the LGPL, and not to allow others to
use your version of this file under the terms of the MPL, indicate your
decision by deleting the provisions above and replace them with the notice
and other provisions required by the GPL or the LGPL. If you do not delete
the provisions above, a recipient may use your version of this file under
the terms of any one of the MPL, the GPL or the LGPL.
***** END LICENSE BLOCK *****/
#ifndef nsIncompleteGamma_h__
#define nsIncompleteGamma_h__
/* An implementation of the incomplete gamma functions for real
arguments. P is defined as
x
/
1 [ a - 1 - t
P(a, x) = -------- I t e dt
Gamma(a) ]
/
0
and
infinity
/
1 [ a - 1 - t
Q(a, x) = -------- I t e dt
Gamma(a) ]
/
x
so that P(a,x) + Q(a,x) = 1.
Both a series expansion and a continued fraction exist. This
implementation uses the more efficient method based on the arguments.
Either case involves calculating a multiplicative term:
e^(-x)*x^a/Gamma(a).
Here we calculate the log of this term. Most math libraries have a
"lgamma" function but it is not re-entrant. Some libraries have a
"lgamma_r" which is re-entrant. Use it if possible. I have included a
simple replacement but it is certainly not as accurate.
Relative errors are almost always < 1e-10 and usually < 1e-14. Very
small and very large arguments cause trouble.
The region where a < 0.5 and x < 0.5 has poor error properties and is
not too stable. Get a better routine if you need results in this
region.
The error argument will be set negative if there is a domain error or
positive for an internal calculation error, currently lack of
convergence. A value is always returned, though.
*/
#include <math.h>
#include <float.h>
// the main routine
static double nsIncompleteGammaP (double a, double x, int *error);
// nsLnGamma(z): either a wrapper around lgamma_r or the internal function.
// C_m = B[2*m]/(2*m*(2*m-1)) where B is a Bernoulli number
static const double C_1 = 1.0 / 12.0;
static const double C_2 = -1.0 / 360.0;
static const double C_3 = 1.0 / 1260.0;
static const double C_4 = -1.0 / 1680.0;
static const double C_5 = 1.0 / 1188.0;
static const double C_6 = -691.0 / 360360.0;
static const double C_7 = 1.0 / 156.0;
static const double C_8 = -3617.0 / 122400.0;
static const double C_9 = 43867.0 / 244188.0;
static const double C_10 = -174611.0 / 125400.0;
static const double C_11 = 77683.0 / 5796.0;
// truncated asymptotic series in 1/z
static inline double lngamma_asymp (double z)
{
double w, w2, sum;
w = 1.0 / z;
w2 = w * w;
sum = w * (w2 * (w2 * (w2 * (w2 * (w2 * (w2 * (w2 * (w2 * (w2
* (C_11 * w2 + C_10) + C_9) + C_8) + C_7) + C_6)
+ C_5) + C_4) + C_3) + C_2) + C_1);
return sum;
}
struct fact_table_s
{
double fact;
double lnfact;
};
// for speed and accuracy
static const struct fact_table_s FactTable[] = {
{1.000000000000000, 0.0000000000000000000000e+00},
{1.000000000000000, 0.0000000000000000000000e+00},
{2.000000000000000, 6.9314718055994530942869e-01},
{6.000000000000000, 1.7917594692280550007892e+00},
{24.00000000000000, 3.1780538303479456197550e+00},
{120.0000000000000, 4.7874917427820459941458e+00},
{720.0000000000000, 6.5792512120101009952602e+00},
{5040.000000000000, 8.5251613610654142999881e+00},
{40320.00000000000, 1.0604602902745250228925e+01},
{362880.0000000000, 1.2801827480081469610995e+01},
{3628800.000000000, 1.5104412573075515295248e+01},
{39916800.00000000, 1.7502307845873885839769e+01},
{479001600.0000000, 1.9987214495661886149228e+01},
{6227020800.000000, 2.2552163853123422886104e+01},
{87178291200.00000, 2.5191221182738681499610e+01},
{1307674368000.000, 2.7899271383840891566988e+01},
{20922789888000.00, 3.0671860106080672803835e+01},
{355687428096000.0, 3.3505073450136888885825e+01},
{6402373705728000., 3.6395445208033053576674e+01}
};
#define FactTableLength (int)(sizeof(FactTable)/sizeof(FactTable[0]))
// for speed
static const double ln_2pi_2 = 0.918938533204672741803; // log(2*PI)/2
/* A simple lgamma function, not very robust.
Valid for z_in > 0 ONLY.
For z_in > 8 precision is quite good, relative errors < 1e-14 and
usually better. For z_in < 8 relative errors increase but are usually
< 1e-10. In two small regions, 1 +/- .001 and 2 +/- .001 errors
increase quickly.
*/
static double nsLnGamma (double z_in, int *gsign)
{
double scale, z, sum, result;
*gsign = 1;
int zi = (int) z_in;
if (z_in == (double) zi)
{
if (0 < zi && zi <= FactTableLength)
return FactTable[zi - 1].lnfact; // gamma(z) = (z-1)!
}
for (scale = 1.0, z = z_in; z < 8.0; ++z)
scale *= z;
sum = lngamma_asymp (z);
result = (z - 0.5) * log (z) - z + ln_2pi_2 - log (scale);
result += sum;
return result;
}
// log( e^(-x)*x^a/Gamma(a) )
static inline double lnPQfactor (double a, double x)
{
int gsign; // ignored because a > 0
return a * log (x) - x - nsLnGamma (a, &gsign);
}
static double Pseries (double a, double x, int *error)
{
double sum, term;
const double eps = 2.0 * DBL_EPSILON;
const int imax = 5000;
int i;
sum = term = 1.0 / a;
for (i = 1; i < imax; ++i)
{
term *= x / (a + i);
sum += term;
if (fabs (term) < eps * fabs (sum))
break;
}
if (i >= imax)
*error = 1;
return sum;
}
static double Qcontfrac (double a, double x, int *error)
{
double result, D, C, e, f, term;
const double eps = 2.0 * DBL_EPSILON;
const double small =
DBL_EPSILON * DBL_EPSILON * DBL_EPSILON * DBL_EPSILON;
const int imax = 5000;
int i;
// modified Lentz method
f = x - a + 1.0;
if (fabs (f) < small)
f = small;
C = f + 1.0 / small;
D = 1.0 / f;
result = D;
for (i = 1; i < imax; ++i)
{
e = i * (a - i);
f += 2.0;
D = f + e * D;
if (fabs (D) < small)
D = small;
D = 1.0 / D;
C = f + e / C;
if (fabs (C) < small)
C = small;
term = C * D;
result *= term;
if (fabs (term - 1.0) < eps)
break;
}
if (i >= imax)
*error = 1;
return result;
}
static double nsIncompleteGammaP (double a, double x, int *error)
{
double result, dom, ldom;
// domain errors. the return values are meaningless but have
// to return something.
*error = -1;
if (a <= 0.0)
return 1.0;
if (x < 0.0)
return 0.0;
*error = 0;
if (x == 0.0)
return 0.0;
ldom = lnPQfactor (a, x);
dom = exp (ldom);
// might need to adjust the crossover point
if (a <= 0.5)
{
if (x < a + 1.0)
result = dom * Pseries (a, x, error);
else
result = 1.0 - dom * Qcontfrac (a, x, error);
}
else
{
if (x < a)
result = dom * Pseries (a, x, error);
else
result = 1.0 - dom * Qcontfrac (a, x, error);
}
// not clear if this can ever happen
if (result > 1.0)
result = 1.0;
if (result < 0.0)
result = 0.0;
return result;
}
#endif