mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-08 20:47:44 +00:00
281 lines
11 KiB
Perl
Executable File
281 lines
11 KiB
Perl
Executable File
#!/usr/bin/perl
|
|
|
|
################################################################################
|
|
# script initialization
|
|
|
|
use strict; # protect us from ourselves
|
|
use DBI; # database stuff
|
|
use Date::Parse;
|
|
use POSIX qw(strftime);
|
|
use Socket; # DNS queries
|
|
use File::Find; # grabbing the list of log files from the filesystem
|
|
#use Fcntl ':flock'; # import LOCK_* constants for locking log files
|
|
|
|
# XXX Should use File::Basename for splitting paths up into paths and filenames.
|
|
|
|
# XXX This stuff should go into a config file.
|
|
my $root_dir = "/data/ftp-logs";
|
|
my $sites = "aol|erlangen|gatech|indiana|isc|oregonstate|rediris|scarlet|utah"; # XXX Should be generated by a database query.
|
|
my $verbose = 1;
|
|
my $DO_REVERSE_DNS_LOOKUPS = 0;
|
|
|
|
chdir $root_dir
|
|
or die "Can't change to root dir $root_dir: $!";
|
|
|
|
# Figure out what period of time to process the logs from. We use
|
|
# a file's "modification time" attribute to store the most recent time
|
|
# at which logs were processed, and we process logs between that time
|
|
# and the present (i.e. from last processed time + 1 to the current time).
|
|
#my $timestamp_file = "$root_dir/last-processed";
|
|
#if (!-e $timestamp_file) {
|
|
# # Create the timestamp file and give it a timestamp way in the past.
|
|
# my $status = system("touch", "-t197001010000", $timestamp_file);
|
|
# if ($status != 0) { die "Couldn't touch $timestamp_file: $!" }
|
|
#}
|
|
#my ($read_time, $last_processed_time) = (stat($timestamp_file))[8,9];
|
|
#my $start_time = $last_processed_time + 1;
|
|
#my $end_time = time;
|
|
#utime($read_time, $end_time, $timestamp_file)
|
|
# or die "Can't update timestamp on $timestamp_file: $!";
|
|
|
|
# Regular expressions that grab data from the log entries; pre-defined
|
|
# and pre-compiled here for performance. The backslash in [^\"]
|
|
# isn't necessary for Perl but fixes indenting confusion in emacs.
|
|
my $common_log_regex = qr/^(\S+) \S+ \S+ \[([^:]+:\d+:\d+:\d+ [^\]]+)] "(\S+) (.*?) (\S+)\/(\S+)" (\S+) (\S+) "([^\"]*)" "([^\"]*)"/o;
|
|
my $aol_log_regex = qr/(\w{3} \w{3} \d\d \d\d:\d\d:\d\d \d{4}) \d+ (\S+) (\d+) (.*?) (\S+) "([^\"]*)" "([^\"]*)"/o;
|
|
|
|
################################################################################
|
|
# database and query configuration
|
|
|
|
# Establish a database connection.
|
|
my $dsn = "DBI:mysql:host=mecha.mozilla.org;database=downloadstats;port=3306";
|
|
my $dbh = DBI->connect($dsn,
|
|
"logprocessord",
|
|
"1ssw?w?",
|
|
{ RaiseError => 1,
|
|
PrintError => 0,
|
|
ShowErrorStatement => 1 }
|
|
);
|
|
|
|
# Prepare the statements we're going to use to insert HTTP log entries into
|
|
# the database.
|
|
my $insert_entry_sth = $dbh->prepare("INSERT INTO entries (id, protocol, protocol_version,
|
|
client, date_time, method, file_id, status, bytes,
|
|
query_id, site_id, log_id)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
|
|
my $insert_file_sth = $dbh->prepare("INSERT INTO files (id, path, name) VALUES (?, ?, ?)");
|
|
my $get_file_id_sth = $dbh->prepare("SELECT id FROM files WHERE path = ? AND name = ?");
|
|
my $get_log_status_sth = $dbh->prepare("SELECT id, status FROM logs WHERE path = ? AND name = ?");
|
|
my $insert_log_sth = $dbh->prepare("INSERT INTO logs (id, path, name, site_id, status) VALUES (?, ?, ?, ?, ?)");
|
|
my $update_log_sth = $dbh->prepare("UPDATE logs SET status = ? WHERE id = ?");
|
|
my $get_site_id_sth = $dbh->prepare("SELECT id FROM sites WHERE abbr = ?");
|
|
my $get_query_id_sth = $dbh->prepare("SELECT id FROM queries WHERE query = ?");
|
|
my $insert_query_sth = $dbh->prepare("INSERT INTO queries (id, query) VALUES (?, ?)");
|
|
|
|
# Get the last unique ID from the database so we know what the next one
|
|
# should be. XXX These assume only one script process will ever be running
|
|
# at a time, which is an unsafe assumption; fix this by locking the tables
|
|
# in question whenever a new entry is to be inserted and then running
|
|
# these queries to get the maximum IDs. Note that locking could be expensive,
|
|
# so perhaps it's better just to lock everything at the beginning and not let
|
|
# a second process access the database at all.
|
|
my ($entry_id) = $dbh->selectrow_array("SELECT MAX(id) FROM entries") || 0;
|
|
my ($max_file_id) = $dbh->selectrow_array("SELECT MAX(id) FROM files") || 0;
|
|
my ($max_query_id) = $dbh->selectrow_array("SELECT MAX(id) FROM queries") || 0;
|
|
|
|
my $seen = 0;
|
|
my $entered = 0;
|
|
|
|
my ($client, $date_time, $method, $file, $protocol, $protocol_version,
|
|
$status, $bytes, $referer, $user_agent, $host, $file_id, $path, $filename);
|
|
|
|
my %hosts;
|
|
my %files;
|
|
|
|
|
|
################################################################################
|
|
# main body
|
|
|
|
find(\&process_log, $root_dir);
|
|
|
|
################################################################################
|
|
# functions
|
|
|
|
sub process_log {
|
|
# Processes a log file, inserting relevant entries into the database.
|
|
# Called from File::Find::find with $_ containing the filename,
|
|
# $File::Find::dir containing the path, and $File::Find::name
|
|
# containing the path + name.
|
|
|
|
my $log_seen = 0;
|
|
my $log_entered = 0;
|
|
|
|
$File::Find::name && $File::Find::dir && $_
|
|
or die "process_log() called without name of file: $File::Find::name\n";
|
|
|
|
my $logfile = $_;
|
|
$File::Find::dir =~ m|^$root_dir/(.*)$|;
|
|
my $relative_path = $1 || $File::Find::dir;
|
|
|
|
# Don't process the file if it's a directory.
|
|
if (-d $logfile) {
|
|
print "Not processing $File::Find::name; directory\n";
|
|
return;
|
|
}
|
|
|
|
# Don't process the file if it isn't a log file.
|
|
# XXX This test may be too brittle, assuming a certain directory
|
|
# and file structure. It does, however, deal with HTTP logs
|
|
# which aren't in an http/ subdirectory of the site directory.
|
|
if ($File::Find::name !~ m!^$root_dir/($sites)/(http/|trillian/)?$logfile$!) {
|
|
print "Not processing $File::Find::name; not an HTTP log file.\n";
|
|
return;
|
|
}
|
|
|
|
# Grab the site's unique ID from the sites table.
|
|
my $site = $1;
|
|
my ($site_id) = $dbh->selectrow_array($get_site_id_sth, {}, $site);
|
|
if (!$site_id) {
|
|
print "Not processing $File::Find::name; couldn't find an entry " .
|
|
"in the sites table for $site.\n";
|
|
return;
|
|
}
|
|
|
|
# The name of the log file without any suffix indicating compression.
|
|
# This script treats foo.log and foo.log.gz as the same file so we can
|
|
# compress logs after processing them and they won't get reprocessed.
|
|
# Note that the "name" column in the "logs" table stores the base name
|
|
# of the log file, not the actual name.
|
|
my $basename = $logfile;
|
|
$basename =~ s/\.(gz|bz2)$//;
|
|
|
|
# Get the log file's unique ID and status from the database.
|
|
my ($log_id, $status) =
|
|
$dbh->selectrow_array($get_log_status_sth, {}, $relative_path, $basename);
|
|
|
|
if (!$log_id) {
|
|
print "Creating entry in database for log $File::Find::name.\n";
|
|
#$dbh->do("LOCK TABLES logs WRITE");
|
|
($log_id) = $dbh->selectrow_array("SELECT MAX(id) FROM logs");
|
|
$log_id = ($log_id || 0) + 1;
|
|
$insert_log_sth->execute($log_id, $relative_path, $basename, $site_id, "new");
|
|
#$dbh->do("UNLOCK TABLES");
|
|
}
|
|
elsif ($status eq "processed") {
|
|
print "Not processing log $File::Find::name; already processed.\n";
|
|
return;
|
|
}
|
|
elsif ($status eq "processing") {
|
|
print "Not processing log $File::Find::name; already being processed.\n";
|
|
return;
|
|
}
|
|
|
|
print "Processing $File::Find::name.\n";
|
|
|
|
$update_log_sth->execute("processing", $log_id) || die $dbh->errstr;
|
|
|
|
if ($logfile =~ /\.gz$/) {
|
|
open(LOGFILE, "gunzip -c $File::Find::name |")
|
|
or die "Couldn't open gzipped file $File::Find::name for reading: $!";
|
|
}
|
|
elsif ($logfile =~ /\.bz2$/) {
|
|
open(LOGFILE, "bunzip2 -c $File::Find::name |")
|
|
or die "Couldn't open bzip2ed file $File::Find::name for reading: $!";
|
|
}
|
|
else {
|
|
open(LOGFILE, "< $File::Find::name")
|
|
or die "Couldn't open file for reading: $!";
|
|
}
|
|
while (<LOGFILE>) {
|
|
# Periodically print out a message about our progress
|
|
# so users know if the script has frozen or is going slowly.
|
|
++$seen;
|
|
++$log_seen;
|
|
print "Processed $log_entered/$log_seen entries for $relative_path/$logfile ($entered/$seen total).\n"
|
|
if ($seen % 1000 == 0) && $verbose;
|
|
|
|
if ($File::Find::name =~ /\.http_trans$/) {
|
|
($date_time, $client, $bytes, $file, $status, $user_agent, $referer)
|
|
= ($_ =~ $aol_log_regex);
|
|
$method = $protocol = $protocol_version = undef;
|
|
}
|
|
else {
|
|
($client, $date_time, $method, $file, $protocol, $protocol_version,
|
|
$status, $bytes, $referer, $user_agent) = ($_ =~ $common_log_regex);
|
|
}
|
|
#print "$client, $date_time, $method, $file, $protocol, $protocol_version, $status, $bytes, $referer, $user_agent\n";
|
|
|
|
# Count only successful requests (whether partial or full).
|
|
next unless $status == 200 || $status == 206;
|
|
|
|
# Strip the URL query string, if any, from the file string.
|
|
($file, my $query) = (split(/\?/, $file));
|
|
|
|
my $query_id;
|
|
if ($query) {
|
|
($query_id) = $dbh->selectrow_array($get_query_id_sth, {}, $query);
|
|
if (!$query_id) {
|
|
$query_id = ++$max_query_id;
|
|
$insert_query_sth->execute($query_id, $query);
|
|
}
|
|
}
|
|
|
|
# Split up the file string into a path and a name.
|
|
$file =~ /^(.*)\/([^\/]*)$/;
|
|
($path, $filename) = ($1, $2);
|
|
|
|
# Only deal with releases, extensions, webtools, and language packs at this point.
|
|
next if $path !~ /releases/ && $path !~ /extensions/ && $path !~ /webtools/ && $path !~ /mozilla\/l10n\/lang/;
|
|
|
|
# Don't bother storing directory accesses, since we don't do anything with them.
|
|
next if !$filename;
|
|
|
|
# Get the file's unique ID or create a record for it if none exists yet.
|
|
$file_id = $files{$file};
|
|
if (!$file_id) {
|
|
($file_id) = $dbh->selectrow_array($get_file_id_sth, {}, $path, $filename);
|
|
if ($file_id) { $files{$file} = $file_id }
|
|
else {
|
|
$file_id = ++$max_file_id;
|
|
$insert_file_sth->execute($file_id, $path, $filename || undef);
|
|
}
|
|
}
|
|
|
|
# Convert the timestamp into MySQL's format (including folding the timezone
|
|
# into the time to convert it to local time, since MySQL DATETIME types
|
|
# don't store timezone information).
|
|
$date_time = strftime("%Y/%m/%d %H:%M:%S", localtime(str2time($date_time)));
|
|
|
|
if ($DO_REVERSE_DNS_LOOKUPS) {
|
|
# Do a reverse DNS lookup to get the domain name from the IP address.
|
|
$host = $hosts{$client};
|
|
if (!$host) {
|
|
if ($client =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/) {
|
|
$host = gethostbyaddr(inet_aton($client), AF_INET) || $client;
|
|
}
|
|
else {
|
|
$host = $client;
|
|
}
|
|
$hosts{$client} = $host;
|
|
}
|
|
}
|
|
else {
|
|
$host = $client;
|
|
}
|
|
#print "$client = $host\n";
|
|
|
|
# Insert the log entry into the database. We increment
|
|
# the ID so this entry has the next unique ID, and we make
|
|
# the filename be NULL if it doesn't exist because that's
|
|
# easier for queries to understand than a blank string.
|
|
++$entry_id;
|
|
++$entered;
|
|
++$log_entered;
|
|
$insert_entry_sth->execute($entry_id, $protocol, $protocol_version, $host,
|
|
$date_time, $method, $file_id, $status, $bytes,
|
|
$query_id, $site_id, $log_id);
|
|
}
|
|
close(LOGFILE);
|
|
$update_log_sth->execute("processed", $log_id) || die $dbh->errstr;
|
|
}
|