diff --git a/webtools/webstats/addstats.pl b/webtools/webstats/addstats.pl index 4846c9422a9e..7d1e00ef1b3a 100755 --- a/webtools/webstats/addstats.pl +++ b/webtools/webstats/addstats.pl @@ -96,7 +96,7 @@ while () { } chomp; - if (! m@\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\].*GET (/\S*) HTTP@) { + if (! m@\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\w+ (/\S*)@) { next; } diff --git a/webtools/webstats/addstats.sh b/webtools/webstats/addstats.sh new file mode 100755 index 000000000000..4156d0cc4723 --- /dev/null +++ b/webtools/webstats/addstats.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +# steal code from http log cruncher for ftp.mozilla.org. yes, it's hacky +# but it works +# +# awk '{print $4,$7,$9}' # Grab the date, url and http status code +# grep 200$ # Only use the status code 200 (complete downloads) +# awk '{print $1, $2}' # Remove http status code, leaving the date and url +# sed "s/\?.*//" # Remove query strings +# these should be equivalent ('...blah/index.html' == '...blah/' == '...blah') +# sed "s/\/index\.html$/\//" # Remove trailing index.html so these entries group + # with urls without +# sed "s/\(\w\)\/$/\1/" # Remove ending slash on directories so they group + # together. Not all directories have them. But don't + # remove a lone slash. (the root url) + + +zcat "$@" | awk '{print $4, $7, $9}' | grep 200$ | awk '{print $1, $2}' | sed "s/\?.*//" | sed "s/\/index\.html$/\//" | sed "s/\(\w\)\/$/\1/" | ./addstats.pl