Weed out cruft. Two years of data contained 5 million URLs, most of which were various Bonsai and htdig queries. This is insane. Crop off everything in the URL after a question mark. Crop off "index.html". Crop off trailing slashes, except the root of the web site, which is a lone slash. Added a shell script to preprocess the log data, reusing code from another web analyzer script.
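For illustration, a minimal sketch of the three cropping rules applied to made-up sample URLs, using the same sed expressions as the addstats.sh script below:

  printf '%s\n' '/search.cgi?query=foo' '/projects/index.html' '/projects/' '/' \
    | sed "s/\?.*//" | sed "s/index\.html$//" | sed "s/\(\w\)\/$/\1/"
  # -> /search.cgi
  # -> /projects
  # -> /projects
  # -> /            (the lone root slash survives)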

endico%mozilla.org 2001-08-01 03:52:22 +00:00
parent d026e97c8c
commit c9b38c329e
2 changed files with 19 additions and 1 deletion

webtools/webstats/addstats.pl

@@ -96,7 +96,7 @@ while (<STDIN>) {
}
chomp;
-    if (! m@\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\].*GET (/\S*) HTTP@) {
+    if (! m@\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\w+ (/\S*)@) {
next;
}
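In effect, the old pattern only matched raw http access-log lines of the form "GET /url HTTP"; the loosened pattern also accepts the "date url" lines that the new addstats.sh preprocessor (below) emits. A minimal sketch of the new pattern against a made-up preprocessed line:

  echo '[01/Aug/2001:03:52:22 /projects' \
    | perl -ne 'print "day=$1 month=$2 year=$3 url=$7\n"
                if m@\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+).*\w+ (/\S*)@;'
  # -> day=01 month=Aug year=2001 url=/projects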

webtools/webstats/addstats.sh Executable file

@@ -0,0 +1,18 @@
#!/bin/sh
# Code stolen from the http log cruncher for ftp.mozilla.org. Yes, it's hacky,
# but it works.
#
# awk '{print $4,$7,$9}'  # Grab the date, url and http status code
# grep 200$               # Only keep status code 200 (complete downloads)
# awk '{print $1, $2}'    # Drop the status code, leaving the date and url
# sed "s/\?.*//"          # Remove query strings
# These should all be equivalent ('...blah/index.html' == '...blah/' == '...blah'):
# sed "s/index\.html$//"  # Remove a trailing index.html so these entries group
#                         # with urls that lack it
# sed "s/\(\w\)\/$/\1/"   # Remove the trailing slash on directories so they
#                         # group together; not all directories have one. But
#                         # don't remove a lone slash (the root url).
zcat $* | awk '{print $4, $7, $9}' | grep 200$ | awk '{print $1, $2}' | sed "s/\?.*//" | sed "s/index\.html$//" | sed "s/\(\w\)\/$/\1/" | ./addstats.pl
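A hypothetical invocation, assuming gzip-compressed access logs (the file names here are illustrative; zcat expects gzipped input, and addstats.pl must sit in the current directory):

  ./addstats.sh access_log-2001-07.gz access_log-2001-08.gz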