gecko-dev/tools/download-stats/process-logs.pl
2004-06-30 01:14:45 +00:00

281 lines
11 KiB
Perl
Executable File

#!/usr/bin/perl
################################################################################
# script initialization
use strict; # protect us from ourselves
use DBI; # database stuff
use Date::Parse;
use POSIX qw(strftime);
use Socket; # DNS queries
use File::Find; # grabbing the list of log files from the filesystem
#use Fcntl ':flock'; # import LOCK_* constants for locking log files
# XXX Should use File::Basename for splitting paths up into paths and filenames.
# XXX This stuff should go into a config file.
# Top of the directory tree that holds the logs. find() is started here and
# process_log() strips this prefix to get each log's relative path.
my $root_dir = "/data/ftp-logs";

# Site abbreviations, written as a regex alternation because the string is
# interpolated directly into the pattern that recognizes log file paths.
# XXX Should be generated by a database query.
my $sites = "aol|erlangen|gatech|indiana|isc|oregonstate|rediris|scarlet|utah";

# When true, print a progress message every 1000 log lines.
my $verbose = 1;

# When true, resolve client IP addresses to host names before storing them.
my $DO_REVERSE_DNS_LOOKUPS = 0;

# Nothing useful can happen without the log tree, so give up right away
# if it can't be entered.
unless (chdir $root_dir) {
    die "Can't change to root dir $root_dir: $!";
}
# Figure out what period of time to process the logs from. We use
# a file's "modification time" attribute to store the most recent time
# at which logs were processed, and we process logs between that time
# and the present (i.e. from last processed time + 1 to the current time).
#my $timestamp_file = "$root_dir/last-processed";
#if (!-e $timestamp_file) {
# # Create the timestamp file and give it a timestamp way in the past.
# my $status = system("touch", "-t197001010000", $timestamp_file);
# if ($status != 0) { die "Couldn't touch $timestamp_file: $!" }
#}
#my ($read_time, $last_processed_time) = (stat($timestamp_file))[8,9];
#my $start_time = $last_processed_time + 1;
#my $end_time = time;
#utime($read_time, $end_time, $timestamp_file)
# or die "Can't update timestamp on $timestamp_file: $!";
# Regular expressions that grab data from the log entries; pre-defined
# and pre-compiled here for performance. The backslash in [^\"]
# isn't necessary for Perl but fixes indenting confusion in emacs.
#
# $common_log_regex is applied to every log that is not an AOL log. It is
# anchored at the start of the line and its captures are, in order:
# client, bracketed timestamp, request method, requested file (including
# any query string), protocol, protocol version, status, bytes, referer,
# and user agent.
my $common_log_regex = qr/^(\S+) \S+ \S+ \[([^:]+:\d+:\d+:\d+ [^\]]+)] "(\S+) (.*?) (\S+)\/(\S+)" (\S+) (\S+) "([^\"]*)" "([^\"]*)"/o;
# $aol_log_regex is applied to logs whose name ends in ".http_trans". It is
# not anchored, and the numeric field right after the timestamp is matched
# but not captured. Its captures are, in order: timestamp, client, bytes,
# requested file, status, user agent, and referer (note that user agent and
# referer come in the opposite order from $common_log_regex).
my $aol_log_regex = qr/(\w{3} \w{3} \d\d \d\d:\d\d:\d\d \d{4}) \d+ (\S+) (\d+) (.*?) (\S+) "([^\"]*)" "([^\"]*)"/o;
################################################################################
# database and query configuration
# Establish a database connection.
my $dsn = "DBI:mysql:host=mecha.mozilla.org;database=downloadstats;port=3306";

# Handle attributes: have DBI raise an exception on any database error
# instead of printing a warning, and include the failing statement in
# the error text.
my %connect_attrs = (
    RaiseError         => 1,
    PrintError         => 0,
    ShowErrorStatement => 1,
);

# NOTE(review): the account name and password are hard-coded here; they
# probably belong in the config file mentioned near the top of the script.
my $dbh = DBI->connect($dsn, "logprocessord", "1ssw?w?", \%connect_attrs);
# Prepare the statements we're going to use to insert HTTP log entries into
# the database. Each handle is prepared once here and then executed many
# times with bound parameters while the logs are processed.

# Adds one row to "entries" for a log entry; the caller supplies the ID.
my $insert_entry_sth = $dbh->prepare("INSERT INTO entries (id, protocol, protocol_version,
client, date_time, method, file_id, status, bytes,
query_id, site_id, log_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
# Requested files are stored once in "files", keyed by path + name, and
# referenced from entries by ID.
my $insert_file_sth = $dbh->prepare("INSERT INTO files (id, path, name) VALUES (?, ?, ?)");
my $get_file_id_sth = $dbh->prepare("SELECT id FROM files WHERE path = ? AND name = ?");
# The "logs" table tracks each log file and its processing status
# ("new", "processing", or "processed").
my $get_log_status_sth = $dbh->prepare("SELECT id, status FROM logs WHERE path = ? AND name = ?");
my $insert_log_sth = $dbh->prepare("INSERT INTO logs (id, path, name, site_id, status) VALUES (?, ?, ?, ?, ?)");
my $update_log_sth = $dbh->prepare("UPDATE logs SET status = ? WHERE id = ?");
# Looks up a site's ID from its abbreviation (the site's directory name).
my $get_site_id_sth = $dbh->prepare("SELECT id FROM sites WHERE abbr = ?");
# URL query strings are stored once in "queries" and referenced by ID.
my $get_query_id_sth = $dbh->prepare("SELECT id FROM queries WHERE query = ?");
my $insert_query_sth = $dbh->prepare("INSERT INTO queries (id, query) VALUES (?, ?)");
# Get the last unique ID from the database so we know what the next one
# should be. XXX These assume only one script process will ever be running
# at a time, which is an unsafe assumption; fix this by locking the tables
# in question whenever a new entry is to be inserted and then running
# these queries to get the maximum IDs. Note that locking could be expensive,
# so perhaps it's better just to lock everything at the beginning and not let
# a second process access the database at all.
# Highest ID currently stored in each table, falling back to 0 when the
# query yields no value. New rows get the next number after these.
my ($entry_id) = $dbh->selectrow_array("SELECT MAX(id) FROM entries");
$entry_id ||= 0;

my ($max_file_id) = $dbh->selectrow_array("SELECT MAX(id) FROM files");
$max_file_id ||= 0;

my ($max_query_id) = $dbh->selectrow_array("SELECT MAX(id) FROM queries");
$max_query_id ||= 0;

# Running totals across all logs: lines read and entries inserted.
my $seen    = 0;
my $entered = 0;

# Working variables for the fields of a single log entry (see process_log).
my (
    $client,
    $date_time,
    $method,
    $file,
    $protocol,
    $protocol_version,
    $status,
    $bytes,
    $referer,
    $user_agent,
    $host,
    $file_id,
    $path,
    $filename,
);

# In-memory caches that save repeated lookups:
my %hosts;    # client address => resolved host name
my %files;    # requested file string => ID in the files table
################################################################################
# main body
# Walk the whole log tree, calling process_log() once for every file and
# directory found beneath $root_dir.
find({ wanted => \&process_log }, $root_dir);
################################################################################
# functions
sub process_log {
    # Processes a log file, inserting relevant entries into the database.
    # Called from File::Find::find with $_ containing the filename,
    # $File::Find::dir containing the path, and $File::Find::name
    # containing the path + name.
    #
    # Directories, files that don't look like HTTP logs of a known site,
    # and logs already marked "processed" or "processing" are skipped with
    # a message. Dies if the log can't be opened or a database call fails.
    my $log_seen    = 0;
    my $log_entered = 0;

    # Test with defined/length rather than plain truth so that a file or
    # directory literally named "0" doesn't abort the whole run.
    defined($File::Find::name) && length($File::Find::name)
        && defined($File::Find::dir) && length($File::Find::dir)
        && defined($_) && length($_)
        or die "process_log() called without name of file: $File::Find::name\n";

    my $logfile = $_;

    # Directory of the log relative to $root_dir. $1 is consulted only when
    # the match succeeded; after a failed match it would still hold the
    # capture of some earlier, unrelated match.
    my $relative_path = $File::Find::dir;
    if ($File::Find::dir =~ m{^\Q$root_dir\E/(.*)$} && $1) {
        $relative_path = $1;
    }

    # Don't process the file if it's a directory.
    if (-d $logfile) {
        print "Not processing $File::Find::name; directory\n";
        return;
    }

    # Don't process the file if it isn't a log file.
    # XXX This test may be too brittle, assuming a certain directory
    # and file structure. It does, however, deal with HTTP logs
    # which aren't in an http/ subdirectory of the site directory.
    # The directory and file names are quoted so that regex metacharacters
    # in them are matched literally; $sites is deliberately left unquoted
    # because it is an alternation.
    if ($File::Find::name !~ m{^\Q$root_dir\E/($sites)/(?:http/|trillian/)?\Q$logfile\E$}) {
        print "Not processing $File::Find::name; not an HTTP log file.\n";
        return;
    }

    # Grab the site's unique ID from the sites table.
    my $site = $1;
    my ($site_id) = $dbh->selectrow_array($get_site_id_sth, {}, $site);
    if (!$site_id) {
        print "Not processing $File::Find::name; couldn't find an entry " .
              "in the sites table for $site.\n";
        return;
    }

    # The name of the log file without any suffix indicating compression.
    # This script treats foo.log and foo.log.gz as the same file so we can
    # compress logs after processing them and they won't get reprocessed.
    # Note that the "name" column in the "logs" table stores the base name
    # of the log file, not the actual name.
    my $basename = $logfile;
    $basename =~ s/\.(gz|bz2)$//;

    # Get the log file's unique ID and status from the database. The status
    # gets its own name so it can't be confused with the HTTP status of the
    # entries parsed below.
    my ($log_id, $log_status) =
        $dbh->selectrow_array($get_log_status_sth, {}, $relative_path, $basename);
    if (!$log_id) {
        print "Creating entry in database for log $File::Find::name.\n";
        #$dbh->do("LOCK TABLES logs WRITE");
        ($log_id) = $dbh->selectrow_array("SELECT MAX(id) FROM logs");
        $log_id = ($log_id || 0) + 1;
        $insert_log_sth->execute($log_id, $relative_path, $basename, $site_id, "new");
        #$dbh->do("UNLOCK TABLES");
    }
    elsif ($log_status eq "processed") {
        print "Not processing log $File::Find::name; already processed.\n";
        return;
    }
    elsif ($log_status eq "processing") {
        print "Not processing log $File::Find::name; already being processed.\n";
        return;
    }

    print "Processing $File::Find::name.\n";
    $update_log_sth->execute("processing", $log_id) or die $dbh->errstr;

    my $log_fh = _open_log_file($File::Find::name);

    # AOL logs have their own line format. Decide from the base name so
    # that a compressed AOL log is still recognized as one.
    my $is_aol_log = ($basename =~ /\.http_trans$/) ? 1 : 0;

    while (my $line = <$log_fh>) {
        # Periodically print out a message about our progress
        # so users know if the script has frozen or is going slowly.
        ++$seen;
        ++$log_seen;
        print "Processed $log_entered/$log_seen entries for $relative_path/$logfile ($entered/$seen total).\n"
            if ($seen % 1000 == 0) && $verbose;

        # A line that doesn't match leaves every field undefined.
        if ($is_aol_log) {
            ($date_time, $client, $bytes, $file, $status, $user_agent, $referer)
                = ($line =~ $aol_log_regex);
            $method = $protocol = $protocol_version = undef;
        }
        else {
            ($client, $date_time, $method, $file, $protocol, $protocol_version,
             $status, $bytes, $referer, $user_agent) = ($line =~ $common_log_regex);
        }

        # Count only successful requests (whether partial or full). Lines
        # that couldn't be parsed have no status and are skipped here too.
        next unless defined $status && ($status == 200 || $status == 206);

        # Strip the URL query string, if any, from the file string. The
        # limit of 2 keeps a query string intact when it contains "?" itself.
        ($file, my $query) = split(/\?/, $file, 2);
        my $query_id;
        $query_id = _query_id_for($query) if $query;

        # Split up the file string into a path and a name. Skip the entry
        # when the string has no "/" at all, rather than reading captures
        # left behind by an earlier match.
        next unless defined $file && $file =~ m{^(.*)/([^/]*)$};
        ($path, $filename) = ($1, $2);

        # Only deal with releases, extensions, webtools, and language packs at this point.
        next unless $path =~ m{releases|extensions|webtools|mozilla/l10n/lang};

        # Don't bother storing directory accesses, since we don't do anything with them.
        next if !$filename;

        # Get the file's unique ID or create a record for it if none exists yet.
        $file_id = _file_id_for($file, $path, $filename);

        # Convert the timestamp into MySQL's format (including folding the timezone
        # into the time to convert it to local time, since MySQL DATETIME types
        # don't store timezone information).
        $date_time = strftime("%Y/%m/%d %H:%M:%S", localtime(str2time($date_time)));

        $host = _host_for($client);

        # Insert the log entry into the database. We increment
        # the ID first so this entry has the next unique ID.
        ++$entry_id;
        ++$entered;
        ++$log_entered;
        $insert_entry_sth->execute($entry_id, $protocol, $protocol_version, $host,
                                   $date_time, $method, $file_id, $status, $bytes,
                                   $query_id, $site_id, $log_id);
    }

    # For a decompressor pipe, close() also reports a non-zero exit of the
    # child, which means the log may only have been read in part. Make that
    # visible instead of ignoring it silently.
    close($log_fh)
        or warn "Problem closing $File::Find::name (error: $!, exit status: $?); " .
                "the log may not have been read completely.\n";

    $update_log_sth->execute("processed", $log_id) or die $dbh->errstr;
}

sub _open_log_file {
    # Opens the log at the given path for reading and returns the filehandle,
    # decompressing on the fly when the name ends in .gz or .bz2. Dies if the
    # file (or the decompressor) can't be opened.
    my ($log_path) = @_;
    my $fh;

    # The decompressors are started with list-form open so that the file
    # name is passed to them as-is and never interpreted by a shell.
    if ($log_path =~ /\.gz$/) {
        open($fh, '-|', 'gunzip', '-c', $log_path)
            or die "Couldn't open gzipped file $log_path for reading: $!";
    }
    elsif ($log_path =~ /\.bz2$/) {
        open($fh, '-|', 'bunzip2', '-c', $log_path)
            or die "Couldn't open bzip2ed file $log_path for reading: $!";
    }
    else {
        open($fh, '<', $log_path)
            or die "Couldn't open file for reading: $!";
    }
    return $fh;
}

sub _query_id_for {
    # Returns the ID of the given URL query string in the queries table,
    # inserting a new row for it first if it isn't there yet.
    my ($query) = @_;
    my ($query_id) = $dbh->selectrow_array($get_query_id_sth, {}, $query);
    if (!$query_id) {
        $query_id = ++$max_query_id;
        $insert_query_sth->execute($query_id, $query);
    }
    return $query_id;
}

sub _file_id_for {
    # Returns the ID of a requested file in the files table, inserting a new
    # row for it first if it isn't there yet. $file is the full file string
    # (the cache key); $path and $filename are its two halves.
    my ($file, $path, $filename) = @_;
    my $file_id = $files{$file};
    if (!$file_id) {
        ($file_id) = $dbh->selectrow_array($get_file_id_sth, {}, $path, $filename);
        if (!$file_id) {
            $file_id = ++$max_file_id;
            $insert_file_sth->execute($file_id, $path, $filename);
        }
        # Remember the ID whether it was found or just created, so the next
        # request for the same file needs no database lookup.
        $files{$file} = $file_id;
    }
    return $file_id;
}

sub _host_for {
    # Returns the host to store for a client. With reverse DNS lookups
    # disabled this is the client string itself; otherwise dotted-quad
    # addresses are resolved to a name (cached in %hosts), falling back
    # to the address when the lookup fails.
    my ($client) = @_;
    return $client unless $DO_REVERSE_DNS_LOOKUPS;

    my $host = $hosts{$client};
    if (!$host) {
        $host = $client;
        # Anchored so that only a bare IP address triggers a lookup, not a
        # host name that merely contains something looking like one.
        if ($client =~ /\A\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\z/) {
            my $packed_address = inet_aton($client);
            $host = (defined $packed_address
                     && gethostbyaddr($packed_address, AF_INET)) || $client;
        }
        $hosts{$client} = $host;
    }
    return $host;
}