initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them

This commit is contained in:
myk%mozilla.org 2004-05-13 21:41:14 +00:00
Parent 93189a5531
Commit 14b0e293c9
3 changed files: 468 additions and 0 deletions

View File

@ -0,0 +1,209 @@
#!/usr/bin/perl
# download-stats: reads download-log data already loaded into the MySQL
# "downloadstats" database and generates download statistics from it,
# driven by a stats definition file named on the command line.
use DBI;
use LWP::Simple;
use Template;
use strict;
# Establish a database connection.
# NOTE(review): database credentials are hard-coded in source; they should
# live in a config file outside version control.
my $dsn = "DBI:mysql:host=mecha.mozilla.org;database=downloadstats;port=3306";
my $dbh = DBI->connect($dsn,
"logprocessord",
"1ssw?w?",
{ RaiseError => 1,
PrintError => 0,
ShowErrorStatement => 1 }
);
################################################################################
# Stats Configuration

# All these variables can be redefined in the stats definition file
# (a Perl snippet evaluated via "do" below). These are just the defaults.

# List of stats to generate.
our @stats;

# Date range to which we limit the query.
our ($start_date, $end_date);

# Whether or not to try to add up partial downloads from the same client
# to see if they count as a complete download. Doesn't work well without
# DNS lookups, which we aren't doing for performance reasons.
our $do_segment_count = 0;

# Parse the definition file and make sure it defines some stats.
my $stats_defs = $ARGV[0];
defined($stats_defs) or die "You didn't reference a stats definition file.\n";
# "do FILE" reports compile/runtime errors in $@ and read errors in $!;
# the old "do $file || die ...: $!" missed parse errors entirely and also
# died spuriously when the file merely returned a false value.
my $defs_result = do $stats_defs;
if ($@) {
    die "Couldn't parse stats definition file: $@\n";
}
if (!defined($defs_result) && $!) {
    die "Couldn't read stats definition file: $!\n";
}
# defined(@array) is deprecated (and fatal as of Perl 5.22); testing the
# array directly is the correct emptiness check.
@stats or die "The stats definition file didn't define any stats.\n";
################################################################################
# Data Validation

# Both date bounds, when supplied, must look like "yyyy-mm-dd", optionally
# followed by " hh:mm" or " hh:mm:ss".  This also makes them safe to
# interpolate into the SQL date clause below.
foreach my $bound ([ start => $start_date ], [ end => $end_date ]) {
    my ($which, $value) = @$bound;
    next unless $value;
    $value =~ /^\d\d\d\d-\d\d-\d\d( \d\d:\d\d(:\d\d)?)?$/
        or die "Invalid $which date $value (must be in format yyyy-mm-dd (hh:mm(:ss)?)?).";
}
################################################################################
# Queries

# Build the date-range WHERE fragment.  The "1=1" placeholder guarantees
# the clause is never empty so it can always be AND-joined into the SQL
# below.  Interpolating $start_date/$end_date directly into SQL is only
# safe because both were strictly validated against a date pattern above.
my @date_criteria = ("1=1");
if ($start_date) { push(@date_criteria, "date_time >= '$start_date'") }
if ($end_date) { push(@date_criteria, "date_time <= '$end_date'") }
my $date_clause = join(" AND ", @date_criteria);
# Completed downloads.
# (bytes equals the full file size; placeholders: path, name, size)
my $done =
$dbh->prepare("SELECT COUNT(*) FROM entries JOIN files ON entries.file_id = files.id " .
"WHERE $date_clause AND files.path = ? AND files.name = ? AND bytes = ?");
# Not completed downloads.
# (a 200 response that transferred a byte count different from the file size)
my $not_done =
$dbh->prepare("SELECT COUNT(*) FROM entries JOIN files ON entries.file_id = files.id " .
"WHERE $date_clause AND files.path = ? AND files.name = ? AND bytes != ? " .
"AND status = 200");
# Partial content requests; may or may not be completed.
# (HTTP 206 responses; byte counts are per-segment so completeness is unknown)
my $may_be_done =
$dbh->prepare("SELECT COUNT(*) FROM entries JOIN files ON entries.file_id = files.id " .
"WHERE $date_clause AND files.path = ? AND files.name = ? AND status = 206");
# A way to get the count of people who altogether completed a download.
# Only run if $do_segment_count is true. Note that this query is expensive
# and only ever returns a fraction of the total, so it's not that useful.
# Also, it probably doesn't work unless we reverse DNS every address
# in the logs, which we aren't doing at the moment for performance.
# (returns one row per client whose segment bytes sum to the file size)
my $done_in_segments =
$dbh->prepare("SELECT 1 FROM entries JOIN files ON entries.file_id = files.id " .
"WHERE $date_clause AND files.path = ? AND files.name = ? " .
"GROUP BY client HAVING SUM(bytes) = ? AND COUNT(bytes) > 1");
################################################################################
# Stats Retrieval

# For each active stat, each platform, and each file type, run the count
# queries above and attach the results to the stat structure (consumed by
# the template in the Output section).  Progress goes to STDERR so the
# generated report on STDOUT stays clean.
foreach my $stat (@stats) {
next if !$stat->{isactive};
print STDERR "$stat->{name} $stat->{version}...\n";
my $platforms = $stat->{platforms};
foreach my $platform (keys %$platforms) {
print STDERR " $platform\n";
my $files = $platforms->{$platform};
foreach my $type (keys %$files) {
print STDERR " $type: ";
my $file = $files->{$type};
# LWP::Simple::head() returns (content_type, document_length, ...);
# the second element is the Content-Length reported by the server.
my (undef, $file_size) = head("http://ftp.mozilla.org$stat->{path}/$file->{name}");
# Fall back on the size from the stats definition file if the HEAD
# request didn't yield one; die if neither source knows the size.
$file_size ||= $file->{size}
or die "Can't figure out the size of $stat->{path}/$file->{name}.";
$done->execute($stat->{path}, $file->{name}, $file_size);
my ($done_count) = $done->fetchrow_array();
my $done_in_segments_count = "N/A";
my $total_done = $done_count;
if ($do_segment_count) {
# The query returns one row per client who completed the file in
# segments, so the row count is the client count.
$done_in_segments->execute($stat->{path}, $file->{name}, $file_size);
$done_in_segments_count = $done_in_segments->fetchall_arrayref();
$done_in_segments_count = scalar(@$done_in_segments_count);
$total_done += $done_in_segments_count;
}
$not_done->execute($stat->{path}, $file->{name}, $file_size);
my ($not_done_count) = $not_done->fetchrow_array();
$may_be_done->execute($stat->{path}, $file->{name});
my ($may_be_done_count) = $may_be_done->fetchrow_array();
# Stash the counts on the file record for the template below.
$file->{counts} = {
complete_uni => $done_count,
#complete_multi => $done_in_segments_count,
incomplete => $not_done_count,
partial => $may_be_done_count,
};
print STDERR "$done_count / $not_done_count / $may_be_done_count / $done_in_segments_count / $total_done\n";
}
}
}
################################################################################
# Output

# Render the collected stats as an HTML report on STDOUT using Template
# Toolkit.  PRE_CHOMP/POST_CHOMP strip the whitespace around directives
# so the template's control flow doesn't leak blank lines into the HTML.
my $template = <<'EOF';
<html>
<head>
<title></title>
<style type="text/css">
th { text-align: left; }
th, td { border: solid 1px black; }
table { border-collapse: collapse;
border: solid 1px black; }
</style>
</head>
<body>
[% FOREACH stat = stats %]
[% NEXT IF !stat.isactive %]
[% app_total = 0 %]
<h2>[% stat.name %] [%+ stat.version %] Download Stats</h2>
<p>[% start_date || "the beginning of time" %] to [% end_date || "the end of time" %]</p>
<table summary="[% stat.name %] [%+ stat.version %] Downloads">
<tr>
<th>Build</th>
<th>Downloads</th>
</tr>
[% FOREACH platform = stat.platforms %]
[% platform_total = 0 %]
<tr>
<td colspan="2"><h3>[% platform.key %]</h3></td>
</tr>
[% files = platform.value %]
[% FOREACH file = files %]
[% file_total = file.value.counts.complete_uni + file.value.counts.complete_multi %]
<tr>
<td>[% file.key %]:[% file.value.name %]</td>
<td>[% file_total %]</td>
</tr>
[% platform_total = platform_total + file_total %]
[% END %]
<tr>
<td>total for [% platform.key %]</td>
<td>[% platform_total %]</td>
</tr>
[% app_total = app_total + platform_total %]
[% END %]
<tr>
<td>grand total</td>
<td>[% app_total %]</td>
</tr>
</table>
[% END %]
</body>
</html>
EOF
# Direct method call rather than indirect-object "new Template" syntax.
my $tt = Template->new({ PRE_CHOMP => 1, POST_CHOMP => 1});
$tt->process(\$template, {stats => \@stats,
                          start_date => $start_date,
                          end_date => $end_date})
    # Bug fix: error() is a method on the Template object, not on the
    # template string; the old "$template->error()" would itself die with
    # "Can't locate object method" whenever process() failed.
    || die "Template process failed: ", $tt->error(), "\n";

View File

@ -0,0 +1,259 @@
#!/usr/bin/perl
################################################################################
# script initialization
# process-logs: walks the FTP-mirror log tree under $root_dir and loads
# HTTP download entries into the MySQL "downloadstats" database, which the
# companion download-stats script later queries.
use strict; # protect us from ourselves
use DBI; # database stuff
use Date::Parse;
use POSIX qw(strftime);
use Socket; # DNS queries
use File::Find; # grabbing the list of log files from the filesystem
#use Fcntl ':flock'; # import LOCK_* constants for locking log files
# The place to put the results of running this script.
my $LOG = "/var/log/last-process-logs.log";
# XXX Probably should use File::Basename for splitting paths up into paths and filenames.
# Stuff that should really go into a config file.
my $root_dir = "/data/ftp-logs";
my $sites = "aol|gatech|indiana|isc|oregonstate|rediris|scarlet|utah"; # XXX Maybe this should be generated by a database query.
my $verbose = 1;
my $DO_REVERSE_DNS_LOOKUPS = 0;
# Figure out what period of time to process the logs from. We use
# a file's "modification time" attribute to store the most recent time
# at which logs were processed, and we process logs between that time
# and the present (i.e. from last processed time + 1 to the current time).
# NOTE(review): this timestamp mechanism is entirely commented out; the
# script currently relies on per-log "status" rows in the database instead.
#my $timestamp_file = "$root_dir/last-processed";
#if (!-e $timestamp_file) {
# # Create the timestamp file and give it a timestamp way in the past.
# my $status = system("touch", "-t197001010000", $timestamp_file);
# if ($status != 0) { die "Couldn't touch $timestamp_file: $!" }
#}
#my ($read_time, $last_processed_time) = (stat($timestamp_file))[8,9];
#my $start_time = $last_processed_time + 1;
#my $end_time = time;
#utime($read_time, $end_time, $timestamp_file)
# or die "Can't update timestamp on $timestamp_file: $!";
#CREATE TABLE entries (id INT PRIMARY KEY, protocol VARCHAR(4), protocol_version VARCHAR(5), client VARCHAR(15), date_time DATETIME, method VARCHAR(4), file_id INT, status CHAR(3), bytes INT, site_id TINYINT, log_id INT);
# Regular expressions that grab data from the log entries; pre-defined
# and pre-compiled here for performance. The backslash in [^\"]
# isn't necessary for Perl but fixes indenting confusion in emacs.
# Capture order (per the assignments in process_log below):
#   common: client, date_time, method, file, protocol, protocol_version,
#           status, bytes, referer, user_agent
#   aol:    date_time, client, bytes, file, status, user_agent, referer
my $common_log_regex = qr/^(\S+) \S+ \S+ \[([^:]+:\d+:\d+:\d+ [^\]]+)] "(\S+) (.*?) (\S+)\/(\S+)" (\S+) (\S+) "([^\"]*)" "([^\"]*)"/o;
my $aol_log_regex = qr/(\w{3} \w{3} \d\d \d\d:\d\d:\d\d \d{4}) \d+ (\S+) (\d+) (.*?) (\S+) "([^\"]*)" "([^\"]*)"/o;
################################################################################
# database and query configuration
# Establish a database connection.
# NOTE(review): credentials are hard-coded in source (duplicated from the
# stats script); they should live in a shared config file.
my $dsn = "DBI:mysql:host=mecha.mozilla.org;database=downloadstats;port=3306";
my $dbh = DBI->connect($dsn,
"logprocessord",
"1ssw?w?",
{ RaiseError => 1,
PrintError => 0,
ShowErrorStatement => 1 }
);
# Prepare the statements we're going to use to insert HTTP log entries into
# the database.
my $insert_entry_sth = $dbh->prepare("INSERT INTO entries (id, protocol, protocol_version,
client, date_time, method, file_id, status, bytes, site_id, log_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
my $insert_file_sth = $dbh->prepare("INSERT INTO files (id, path, name) VALUES (?, ?, ?)");
my $get_file_id_sth = $dbh->prepare("SELECT id FROM files WHERE path = ? AND name = ?");
my $get_log_status_sth = $dbh->prepare("SELECT id, status FROM logs WHERE path = ? AND name = ?");
my $insert_log_sth = $dbh->prepare("INSERT INTO logs (id, path, name, site_id, status) VALUES (?, ?, ?, ?, ?)");
my $update_log_sth = $dbh->prepare("UPDATE logs SET status = ? WHERE id = ?");
my $get_site_id_sth = $dbh->prepare("SELECT id FROM sites WHERE abbr = ?");
# Get the last unique ID from the database so we know what the next one
# should be. XXX These assume only one script process will ever be running
# at a time, which is an unsafe assumption; fix this by locking the tables
# in question whenever a new entry is to be inserted and then running
# these queries to get the maximum IDs. Note that locking could be expensive,
# so perhaps it's better just to lock everything at the beginning and not let
# a second process access the database at all.
# NOTE(review): "selectrow_array(...) || 0" puts the call in scalar/boolean
# context, so it yields the single MAX(id) value (or 0 when the table is
# empty and MAX is NULL) — subtle but it works.
my ($entry_id) = $dbh->selectrow_array("SELECT MAX(id) FROM entries") || 0;
my ($max_file_id) = $dbh->selectrow_array("SELECT MAX(id) FROM files") || 0;
# Running totals across all logs, printed as progress output.
my $seen = 0;
my $entered = 0;
# Per-entry fields shared with process_log() below; reassigned (or reset to
# undef by a failed regex match) on every log line.
my ($client, $date_time, $method, $file, $protocol, $protocol_version,
$status, $bytes, $referer, $user_agent, $host, $file_id, $path, $filename);
# Caches: reverse-DNS results by IP, and file-table IDs by request path.
my %hosts;
my %files;
################################################################################
# main body
# LOG is intentionally a global (bareword) handle: process_log(), invoked
# for every file under $root_dir by File::Find, prints its progress to it.
open(LOG, ">", $LOG) or die "Can't open $LOG: $!";
find(\&process_log, $root_dir);
close(LOG);
################################################################################
# functions
sub process_log {
    # Processes a log file, inserting relevant entries into the database.
    # Called from File::Find::find with $_ containing the filename,
    # $File::Find::dir containing the path, and $File::Find::name
    # containing the path + name.  Progress and skip reasons go to the
    # global LOG handle opened in the main body.
    my $log_seen = 0;     # lines read from this log
    my $log_entered = 0;  # entries inserted from this log

    $File::Find::name && $File::Find::dir && $_
        or die "process_log() called without name of file: $File::Find::name\n";

    my $logfile = $_;

    # The log's directory relative to $root_dir (falling back on the
    # absolute directory when the strip fails, e.g. for $root_dir itself).
    $File::Find::dir =~ m|^$root_dir/(.*)$|;
    my $relative_path = $1 || $File::Find::dir;

    # Don't process the file if it's a directory.
    if (-d $logfile) {
        print LOG "Not processing $File::Find::name; directory\n";
        return;
    }

    # Don't process the file if it isn't a log file.
    # XXX This test may be too brittle, assuming a certain directory
    # and file structure. It does, however, deal with HTTP logs
    # which aren't in an http/ subdirectory of the site directory.
    # \Q...\E quotes regex metacharacters in the filename (dots in
    # "access.log.gz" previously matched any character).
    if ($File::Find::name !~ m|^$root_dir/($sites)/(http/)?\Q$logfile\E$|) {
        print LOG "Not processing $File::Find::name; not an HTTP log file.\n";
        return;
    }

    # Grab the site's unique ID from the sites table.
    my $site = $1;
    my ($site_id) = $dbh->selectrow_array($get_site_id_sth, {}, $site);
    if (!$site_id) {
        print LOG "Not processing $File::Find::name; couldn't find an entry " .
                  "in the sites table for $site.\n";
        return;
    }

    # Get the log file's unique ID and processing status, creating a row
    # for it if this is the first time we've seen this log.  (Renamed from
    # $status to avoid shadowing the per-entry HTTP status used below.)
    my ($log_id, $log_status) =
        $dbh->selectrow_array($get_log_status_sth, {}, $relative_path, $logfile);
    if (!$log_id) {
        print LOG "Creating entry in database for log $File::Find::name.\n";
        #$dbh->do("LOCK TABLES logs WRITE");
        ($log_id) = $dbh->selectrow_array("SELECT MAX(id) FROM logs");
        $log_id = ($log_id || 0) + 1;
        $insert_log_sth->execute($log_id, $relative_path, $logfile, $site_id, "new");
        #$dbh->do("UNLOCK TABLES");
    }
    elsif ($log_status eq "processed") {
        print LOG "Not processing log $File::Find::name; already processed.\n";
        return;
    }
    elsif ($log_status eq "processing") {
        print LOG "Not processing log $File::Find::name; already being processed.\n";
        return;
    }

    print LOG "Processing $File::Find::name.\n";
    $update_log_sth->execute("processing", $log_id) || die $dbh->errstr;

    # Open the log, decompressing on the fly if it's gzipped.  The list
    # form of pipe-open bypasses the shell, so a hostile filename can't
    # inject commands (the old two-arg 'gunzip -c $name |' interpolated
    # the filename into a shell command line).
    my $logfh;
    if ($logfile =~ /\.gz$/) {
        open($logfh, "-|", "gunzip", "-c", $File::Find::name)
            or die "Couldn't open gzipped file for reading: $!";
    }
    else {
        open($logfh, "<", $File::Find::name)
            or die "Couldn't open file for reading: $!";
    }

    while (<$logfh>) {
        # Periodically print out a message about our progress
        # so users know if the script has frozen or is going slowly.
        ++$seen;
        ++$log_seen;
        print LOG "Processed $log_entered/$log_seen entries for $relative_path/$logfile ($entered/$seen total).\n"
            if ($seen % 1000 == 0) && $verbose;

        # AOL's logs use their own format; everything else is common-log.
        # A failed match assigns the empty list, resetting every field to
        # undef, which the defined() guard below detects.
        if ($File::Find::name =~ /\.http_trans$/) {
            ($date_time, $client, $bytes, $file, $status, $user_agent, $referer)
                = ($_ =~ $aol_log_regex);
            $method = $protocol = $protocol_version = undef;
        }
        else {
            ($client, $date_time, $method, $file, $protocol, $protocol_version,
             $status, $bytes, $referer, $user_agent) = ($_ =~ $common_log_regex);
        }
        #print LOG "$client, $date_time, $method, $file, $protocol, $protocol_version, $status, $bytes, $referer, $user_agent\n";

        # Skip unparseable lines, and count only successful requests
        # (whether partial or full).  The defined() check also silences
        # undef-in-numeric-comparison warnings on non-matching lines.
        next unless defined($status) && ($status == 200 || $status == 206);

        # Split up the file string into a path and a name.  Skipping on a
        # failed match avoids using stale $1/$2 captures from an earlier
        # regex (request paths always contain a slash, so this is rare).
        ($path, $filename) = ($file =~ m|^(.*)/([^/]*)$|)
            or next;

        # Only deal with releases, webtools, and language packs at this point.
        next if $path !~ /releases/ && $path !~ /webtools/ && $path !~ /mozilla\/l10n\/lang/;

        # Strip the URL query string, if any, from the filename.
        $filename = (split(/\?/, $filename))[0];

        # Don't bother storing directory accesses, since we don't do anything with them.
        next if !$filename;

        # Get the file's unique ID or create a record for it if none exists yet,
        # caching IDs in %files so each path hits the database at most once.
        $file_id = $files{$file};
        if (!$file_id) {
            ($file_id) = $dbh->selectrow_array($get_file_id_sth, {}, $path, $filename);
            if ($file_id) { $files{$file} = $file_id }
            else {
                $file_id = ++$max_file_id;
                # $filename is guaranteed non-empty here (checked above),
                # so the old "|| undef" NULL fallback was dead code.
                $insert_file_sth->execute($file_id, $path, $filename);
                $files{$file} = $file_id;
            }
        }

        # Convert the timestamp into MySQL's format (including folding the timezone
        # into the time to convert it to local time, since MySQL DATETIME types
        # don't store timezone information).
        $date_time = strftime("%Y/%m/%d %H:%M:%S", localtime(str2time($date_time)));

        if ($DO_REVERSE_DNS_LOOKUPS) {
            # Do a reverse DNS lookup to get the domain name from the IP
            # address, caching results since clients recur and lookups are slow.
            $host = $hosts{$client};
            if (!$host) {
                if ($client =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/) {
                    $host = gethostbyaddr(inet_aton($client), AF_INET) || $client;
                }
                else {
                    $host = $client;
                }
                $hosts{$client} = $host;
            }
        }
        else {
            $host = $client;
        }
        #print LOG "$client = $host\n";

        # Insert the log entry into the database. We increment
        # the ID so this entry has the next unique ID.
        ++$entry_id;
        ++$entered;
        ++$log_entered;
        $insert_entry_sth->execute($entry_id, $protocol, $protocol_version, $host,
                                   $date_time, $method, $file_id, $status, $bytes,
                                   $site_id, $log_id);
    }
    close($logfh)
        or warn "close failed for $File::Find::name: $!";

    # Mark the log done so a rerun doesn't double-insert its entries.
    $update_log_sth->execute("processed", $log_id) || die $dbh->errstr;
}

View File