зеркало из https://github.com/mozilla/pjs.git
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them
This commit is contained in:
Родитель
93189a5531
Коммит
14b0e293c9
|
@ -0,0 +1,209 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
use DBI;
|
||||
use LWP::Simple;
|
||||
use Template;
|
||||
use strict;
|
||||
|
||||
# Establish a database connection.
|
||||
my $dsn = "DBI:mysql:host=mecha.mozilla.org;database=downloadstats;port=3306";
|
||||
my $dbh = DBI->connect($dsn,
|
||||
"logprocessord",
|
||||
"1ssw?w?",
|
||||
{ RaiseError => 1,
|
||||
PrintError => 0,
|
||||
ShowErrorStatement => 1 }
|
||||
);
|
||||
|
||||
################################################################################
|
||||
# Stats Configuration
|
||||
|
||||
# All these variables can be redefined in the stats definition file.
|
||||
# These are just the default values.
|
||||
|
||||
# List of stats to generate.
|
||||
our @stats;
|
||||
|
||||
# Date range to which we limit the query.
|
||||
our ($start_date, $end_date);
|
||||
|
||||
# Whether or not to try to add up partial downloads from the same client
|
||||
# to see if they count as a complete download. Doesn't work well without
|
||||
# DNS lookups, which we aren't doing for performance reasons.
|
||||
our $do_segment_count = 0;
|
||||
|
||||
# Parse the definition file and make sure it defines some stats.
|
||||
my $stats_defs = $ARGV[0];
|
||||
defined($stats_defs) or die "You didn't reference a stats definition file.\n";
|
||||
do $stats_defs || die "Couldn't parse stats definition file: $!\n";
|
||||
defined(@stats) or die "The stats definition file didn't define any stats.\n";
|
||||
|
||||
################################################################################
|
||||
# Data Validation
|
||||
|
||||
if ($start_date) {
|
||||
$start_date =~ /^\d\d\d\d-\d\d-\d\d( \d\d:\d\d(:\d\d)?)?$/
|
||||
or die "Invalid start date $start_date (must be in format yyyy-mm-dd (hh:mm(:ss)?)?).";
|
||||
}
|
||||
|
||||
if ($end_date) {
|
||||
$end_date =~ /^\d\d\d\d-\d\d-\d\d( \d\d:\d\d(:\d\d)?)?$/
|
||||
or die "Invalid end date $end_date (must be in format yyyy-mm-dd (hh:mm(:ss)?)?).";
|
||||
}
|
||||
|
||||
################################################################################
|
||||
# Queries
|
||||
|
||||
my @date_criteria = ("1=1");
|
||||
if ($start_date) { push(@date_criteria, "date_time >= '$start_date'") }
|
||||
if ($end_date) { push(@date_criteria, "date_time <= '$end_date'") }
|
||||
my $date_clause = join(" AND ", @date_criteria);
|
||||
|
||||
# Completed downloads.
|
||||
my $done =
|
||||
$dbh->prepare("SELECT COUNT(*) FROM entries JOIN files ON entries.file_id = files.id " .
|
||||
"WHERE $date_clause AND files.path = ? AND files.name = ? AND bytes = ?");
|
||||
|
||||
# Not completed downloads.
|
||||
my $not_done =
|
||||
$dbh->prepare("SELECT COUNT(*) FROM entries JOIN files ON entries.file_id = files.id " .
|
||||
"WHERE $date_clause AND files.path = ? AND files.name = ? AND bytes != ? " .
|
||||
"AND status = 200");
|
||||
|
||||
# Partial content requests; may or may not be completed.
|
||||
my $may_be_done =
|
||||
$dbh->prepare("SELECT COUNT(*) FROM entries JOIN files ON entries.file_id = files.id " .
|
||||
"WHERE $date_clause AND files.path = ? AND files.name = ? AND status = 206");
|
||||
|
||||
# A way to get the count of people who altogether completed a download.
|
||||
# Only run if $do_segment_count is true. Note that this query is expensive
|
||||
# and only ever returns a fraction of the total, so it's not that useful.
|
||||
# Also, it probably doesn't work unless we reverse DNS every address
|
||||
# in the logs, which we aren't doing at the moment for performance.
|
||||
my $done_in_segments =
|
||||
$dbh->prepare("SELECT 1 FROM entries JOIN files ON entries.file_id = files.id " .
|
||||
"WHERE $date_clause AND files.path = ? AND files.name = ? " .
|
||||
"GROUP BY client HAVING SUM(bytes) = ? AND COUNT(bytes) > 1");
|
||||
|
||||
|
||||
################################################################################
|
||||
# Stats Retrieval
|
||||
|
||||
foreach my $stat (@stats) {
|
||||
next if !$stat->{isactive};
|
||||
|
||||
print STDERR "$stat->{name} $stat->{version}...\n";
|
||||
|
||||
my $platforms = $stat->{platforms};
|
||||
|
||||
foreach my $platform (keys %$platforms) {
|
||||
print STDERR " $platform\n";
|
||||
|
||||
my $files = $platforms->{$platform};
|
||||
|
||||
foreach my $type (keys %$files) {
|
||||
print STDERR " $type: ";
|
||||
|
||||
my $file = $files->{$type};
|
||||
|
||||
my (undef, $file_size) = head("http://ftp.mozilla.org$stat->{path}/$file->{name}");
|
||||
$file_size ||= $file->{size}
|
||||
or die "Can't figure out the size of $stat->{path}/$file->{name}.";
|
||||
|
||||
$done->execute($stat->{path}, $file->{name}, $file_size);
|
||||
my ($done_count) = $done->fetchrow_array();
|
||||
|
||||
my $done_in_segments_count = "N/A";
|
||||
my $total_done = $done_count;
|
||||
if ($do_segment_count) {
|
||||
$done_in_segments->execute($stat->{path}, $file->{name}, $file_size);
|
||||
$done_in_segments_count = $done_in_segments->fetchall_arrayref();
|
||||
$done_in_segments_count = scalar(@$done_in_segments_count);
|
||||
$total_done += $done_in_segments_count;
|
||||
}
|
||||
|
||||
$not_done->execute($stat->{path}, $file->{name}, $file_size);
|
||||
my ($not_done_count) = $not_done->fetchrow_array();
|
||||
|
||||
$may_be_done->execute($stat->{path}, $file->{name});
|
||||
my ($may_be_done_count) = $may_be_done->fetchrow_array();
|
||||
|
||||
$file->{counts} = {
|
||||
complete_uni => $done_count,
|
||||
#complete_multi => $done_in_segments_count,
|
||||
incomplete => $not_done_count,
|
||||
partial => $may_be_done_count,
|
||||
};
|
||||
|
||||
print STDERR "$done_count / $not_done_count / $may_be_done_count / $done_in_segments_count / $total_done\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
################################################################################
|
||||
# Output
|
||||
|
||||
my $template = <<'EOF';
|
||||
<html>
|
||||
<head>
|
||||
<title></title>
|
||||
<style type="text/css">
|
||||
th { text-align: left; }
|
||||
th, td { border: solid 1px black; }
|
||||
table { border-collapse: collapse;
|
||||
border: solid 1px black; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
[% FOREACH stat = stats %]
|
||||
[% NEXT IF !stat.isactive %]
|
||||
[% app_total = 0 %]
|
||||
<h2>[% stat.name %] [%+ stat.version %] Download Stats</h2>
|
||||
|
||||
<p>[% start_date || "the beginning of time" %] to [% end_date || "the end of time" %]</p>
|
||||
|
||||
<table summary="[% stat.name %] [%+ stat.version %] Downloads">
|
||||
<tr>
|
||||
<th>Build</th>
|
||||
<th>Downloads</th>
|
||||
</tr>
|
||||
[% FOREACH platform = stat.platforms %]
|
||||
[% platform_total = 0 %]
|
||||
|
||||
<tr>
|
||||
<td colspan="2"><h3>[% platform.key %]</h3></td>
|
||||
</tr>
|
||||
|
||||
[% files = platform.value %]
|
||||
[% FOREACH file = files %]
|
||||
[% file_total = file.value.counts.complete_uni + file.value.counts.complete_multi %]
|
||||
<tr>
|
||||
<td>[% file.key %]:[% file.value.name %]</td>
|
||||
<td>[% file_total %]</td>
|
||||
</tr>
|
||||
[% platform_total = platform_total + file_total %]
|
||||
[% END %]
|
||||
|
||||
<tr>
|
||||
<td>total for [% platform.key %]</td>
|
||||
<td>[% platform_total %]</td>
|
||||
</tr>
|
||||
|
||||
[% app_total = app_total + platform_total %]
|
||||
[% END %]
|
||||
<tr>
|
||||
<td>grand total</td>
|
||||
<td>[% app_total %]</td>
|
||||
</tr>
|
||||
</table>
|
||||
[% END %]
|
||||
</body>
|
||||
</html>
|
||||
EOF
|
||||
|
||||
my $tt = new Template({ PRE_CHOMP => 1, POST_CHOMP => 1});
|
||||
|
||||
$tt->process(\$template, {stats => \@stats,
|
||||
start_date => $start_date,
|
||||
end_date => $end_date})
|
||||
|| die "Template process failed: ", $template->error(), "\n";
|
|
@ -0,0 +1,259 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
################################################################################
|
||||
# script initialization
|
||||
|
||||
use strict; # protect us from ourselves
|
||||
use DBI; # database stuff
|
||||
use Date::Parse;
|
||||
use POSIX qw(strftime);
|
||||
use Socket; # DNS queries
|
||||
use File::Find; # grabbing the list of log files from the filesystem
|
||||
#use Fcntl ':flock'; # import LOCK_* constants for locking log files
|
||||
|
||||
# The place to put the results of running this script.
|
||||
my $LOG = "/var/log/last-process-logs.log";
|
||||
|
||||
# XXX Probably should use File::Basename for splitting paths up into paths and filenames.
|
||||
|
||||
# Stuff that should really go into a config file.
|
||||
my $root_dir = "/data/ftp-logs";
|
||||
my $sites = "aol|gatech|indiana|isc|oregonstate|rediris|scarlet|utah"; # XXX Maybe this should be generated by a database query.
|
||||
my $verbose = 1;
|
||||
my $DO_REVERSE_DNS_LOOKUPS = 0;
|
||||
|
||||
# Figure out what period of time to process the logs from. We use
|
||||
# a file's "modification time" attribute to store the most recent time
|
||||
# at which logs were processed, and we process logs between that time
|
||||
# and the present (i.e. from last processed time + 1 to the current time).
|
||||
#my $timestamp_file = "$root_dir/last-processed";
|
||||
#if (!-e $timestamp_file) {
|
||||
# # Create the timestamp file and give it a timestamp way in the past.
|
||||
# my $status = system("touch", "-t197001010000", $timestamp_file);
|
||||
# if ($status != 0) { die "Couldn't touch $timestamp_file: $!" }
|
||||
#}
|
||||
#my ($read_time, $last_processed_time) = (stat($timestamp_file))[8,9];
|
||||
#my $start_time = $last_processed_time + 1;
|
||||
#my $end_time = time;
|
||||
#utime($read_time, $end_time, $timestamp_file)
|
||||
# or die "Can't update timestamp on $timestamp_file: $!";
|
||||
|
||||
#CREATE TABLE entries (id INT PRIMARY KEY, protocol VARCHAR(4), protocol_version VARCHAR(5), client VARCHAR(15), date_time DATETIME, method VARCHAR(4), file_id INT, status CHAR(3), bytes INT, site_id TINYINT, log_id INT);
|
||||
|
||||
# Regular expressions that grab data from the log entries; pre-defined
|
||||
# and pre-compiled here for performance. The backslash in [^\"]
|
||||
# isn't necessary for Perl but fixes indenting confusion in emacs.
|
||||
my $common_log_regex = qr/^(\S+) \S+ \S+ \[([^:]+:\d+:\d+:\d+ [^\]]+)] "(\S+) (.*?) (\S+)\/(\S+)" (\S+) (\S+) "([^\"]*)" "([^\"]*)"/o;
|
||||
my $aol_log_regex = qr/(\w{3} \w{3} \d\d \d\d:\d\d:\d\d \d{4}) \d+ (\S+) (\d+) (.*?) (\S+) "([^\"]*)" "([^\"]*)"/o;
|
||||
|
||||
################################################################################
|
||||
# database and query configuration
|
||||
|
||||
# Establish a database connection.
|
||||
my $dsn = "DBI:mysql:host=mecha.mozilla.org;database=downloadstats;port=3306";
|
||||
my $dbh = DBI->connect($dsn,
|
||||
"logprocessord",
|
||||
"1ssw?w?",
|
||||
{ RaiseError => 1,
|
||||
PrintError => 0,
|
||||
ShowErrorStatement => 1 }
|
||||
);
|
||||
|
||||
# Prepare the statements we're going to use to insert HTTP log entries into
|
||||
# the database.
|
||||
my $insert_entry_sth = $dbh->prepare("INSERT INTO entries (id, protocol, protocol_version,
|
||||
client, date_time, method, file_id, status, bytes, site_id, log_id)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
|
||||
my $insert_file_sth = $dbh->prepare("INSERT INTO files (id, path, name) VALUES (?, ?, ?)");
|
||||
my $get_file_id_sth = $dbh->prepare("SELECT id FROM files WHERE path = ? AND name = ?");
|
||||
my $get_log_status_sth = $dbh->prepare("SELECT id, status FROM logs WHERE path = ? AND name = ?");
|
||||
my $insert_log_sth = $dbh->prepare("INSERT INTO logs (id, path, name, site_id, status) VALUES (?, ?, ?, ?, ?)");
|
||||
my $update_log_sth = $dbh->prepare("UPDATE logs SET status = ? WHERE id = ?");
|
||||
my $get_site_id_sth = $dbh->prepare("SELECT id FROM sites WHERE abbr = ?");
|
||||
|
||||
# Get the last unique ID from the database so we know what the next one
|
||||
# should be. XXX These assume only one script process will ever be running
|
||||
# at a time, which is an unsafe assumption; fix this by locking the tables
|
||||
# in question whenever a new entry is to be inserted and then running
|
||||
# these queries to get the maximum IDs. Note that locking could be expensive,
|
||||
# so perhaps it's better just to lock everything at the beginning and not let
|
||||
# a second process access the database at all.
|
||||
my ($entry_id) = $dbh->selectrow_array("SELECT MAX(id) FROM entries") || 0;
|
||||
my ($max_file_id) = $dbh->selectrow_array("SELECT MAX(id) FROM files") || 0;
|
||||
|
||||
my $seen = 0;
|
||||
my $entered = 0;
|
||||
|
||||
my ($client, $date_time, $method, $file, $protocol, $protocol_version,
|
||||
$status, $bytes, $referer, $user_agent, $host, $file_id, $path, $filename);
|
||||
|
||||
my %hosts;
|
||||
my %files;
|
||||
|
||||
|
||||
################################################################################
|
||||
# main body
|
||||
|
||||
open(LOG, ">", $LOG) or die "Can't open $LOG: $!";
|
||||
find(\&process_log, $root_dir);
|
||||
close(LOG);
|
||||
|
||||
################################################################################
|
||||
# functions
|
||||
|
||||
sub process_log {
|
||||
# Processes a log file, inserting relevant entries into the database.
|
||||
# Called from File::Find::find with $_ containing the filename,
|
||||
# $File::Find::dir containing the path, and $File::Find::name
|
||||
# containing the path + name.
|
||||
|
||||
my $log_seen = 0;
|
||||
my $log_entered = 0;
|
||||
|
||||
$File::Find::name && $File::Find::dir && $_
|
||||
or die "process_log() called without name of file: $File::Find::name\n";
|
||||
|
||||
my $logfile = $_;
|
||||
$File::Find::dir =~ m|^$root_dir/(.*)$|;
|
||||
my $relative_path = $1 || $File::Find::dir;
|
||||
|
||||
# Don't process the file if it's a directory.
|
||||
if (-d $logfile) {
|
||||
print LOG "Not processing $File::Find::name; directory\n";
|
||||
return;
|
||||
}
|
||||
|
||||
# Don't process the file if it isn't a log file.
|
||||
# XXX This test may be too brittle, assuming a certain directory
|
||||
# and file structure. It does, however, deal with HTTP logs
|
||||
# which aren't in an http/ subdirectory of the site directory.
|
||||
if ($File::Find::name !~ m|^$root_dir/($sites)/(http/)?$logfile$|) {
|
||||
print LOG "Not processing $File::Find::name; not an HTTP log file.\n";
|
||||
return;
|
||||
}
|
||||
|
||||
# Grab the site's unique ID from the sites table.
|
||||
my $site = $1;
|
||||
my ($site_id) = $dbh->selectrow_array($get_site_id_sth, {}, $site);
|
||||
if (!$site_id) {
|
||||
print LOG "Not processing $File::Find::name; couldn't find an entry " .
|
||||
"in the sites table for $site.\n";
|
||||
return;
|
||||
}
|
||||
|
||||
# Get the log file's unique ID and status from the database.
|
||||
my ($log_id, $status) =
|
||||
$dbh->selectrow_array($get_log_status_sth, {}, $relative_path, $logfile);
|
||||
|
||||
if (!$log_id) {
|
||||
print LOG "Creating entry in database for log $File::Find::name.\n";
|
||||
#$dbh->do("LOCK TABLES logs WRITE");
|
||||
($log_id) = $dbh->selectrow_array("SELECT MAX(id) FROM logs");
|
||||
$log_id = ($log_id || 0) + 1;
|
||||
$insert_log_sth->execute($log_id, $relative_path, $logfile, $site_id, "new");
|
||||
#$dbh->do("UNLOCK TABLES");
|
||||
}
|
||||
elsif ($status eq "processed") {
|
||||
print LOG "Not processing log $File::Find::name; already processed.\n";
|
||||
return;
|
||||
}
|
||||
elsif ($status eq "processing") {
|
||||
print LOG "Not processing log $File::Find::name; already being processed.\n";
|
||||
return;
|
||||
}
|
||||
|
||||
print LOG "Processing $File::Find::name.\n";
|
||||
|
||||
$update_log_sth->execute("processing", $log_id) || die $dbh->errstr;
|
||||
|
||||
if ($logfile =~ /\.gz$/) {
|
||||
open(LOGFILE, "gunzip -c $File::Find::name |")
|
||||
or die "Couldn't open gzipped file for reading: $!";
|
||||
}
|
||||
else {
|
||||
open(LOGFILE, "< $File::Find::name")
|
||||
or die "Couldn't open file for reading: $!";
|
||||
}
|
||||
while (<LOGFILE>) {
|
||||
# Periodically print out a message about our progress
|
||||
# so users know if the script has frozen or is going slowly.
|
||||
++$seen;
|
||||
++$log_seen;
|
||||
print LOG "Processed $log_entered/$log_seen entries for $relative_path/$logfile ($entered/$seen total).\n"
|
||||
if ($seen % 1000 == 0) && $verbose;
|
||||
|
||||
if ($File::Find::name =~ /\.http_trans$/) {
|
||||
($date_time, $client, $bytes, $file, $status, $user_agent, $referer)
|
||||
= ($_ =~ $aol_log_regex);
|
||||
$method = $protocol = $protocol_version = undef;
|
||||
}
|
||||
else {
|
||||
($client, $date_time, $method, $file, $protocol, $protocol_version,
|
||||
$status, $bytes, $referer, $user_agent) = ($_ =~ $common_log_regex);
|
||||
}
|
||||
#print LOG "$client, $date_time, $method, $file, $protocol, $protocol_version, $status, $bytes, $referer, $user_agent\n";
|
||||
|
||||
# Count only successful requests (whether partial or full).
|
||||
next unless $status == 200 || $status == 206;
|
||||
|
||||
# Split up the file string into a path and a name.
|
||||
$file =~ /^(.*)\/([^\/]*)$/;
|
||||
($path, $filename) = ($1, $2);
|
||||
|
||||
# Only deal with releases, webtools, and language packs at this point.
|
||||
next if $path !~ /releases/ && $path !~ /webtools/ && $path !~ /mozilla\/l10n\/lang/;
|
||||
|
||||
# Strip the URL query string, if any, from the filename.
|
||||
$filename = (split(/\?/, $filename))[0];
|
||||
|
||||
# Don't bother storing directory accesses, since we don't do anything with them.
|
||||
next if !$filename;
|
||||
|
||||
# Get the file's unique ID or create a record for it if none exists yet.
|
||||
$file_id = $files{$file};
|
||||
if (!$file_id) {
|
||||
($file_id) = $dbh->selectrow_array($get_file_id_sth, {}, $path, $filename);
|
||||
if ($file_id) { $files{$file} = $file_id }
|
||||
else {
|
||||
$file_id = ++$max_file_id;
|
||||
$insert_file_sth->execute($file_id, $path, $filename || undef);
|
||||
}
|
||||
}
|
||||
|
||||
# Convert the timestamp into MySQL's format (including folding the timezone
|
||||
# into the time to convert it to local time, since MySQL DATETIME types
|
||||
# don't store timezone information).
|
||||
$date_time = strftime("%Y/%m/%d %H:%M:%S", localtime(str2time($date_time)));
|
||||
|
||||
if ($DO_REVERSE_DNS_LOOKUPS) {
|
||||
# Do a reverse DNS lookup to get the domain name from the IP address.
|
||||
$host = $hosts{$client};
|
||||
if (!$host) {
|
||||
if ($client =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/) {
|
||||
$host = gethostbyaddr(inet_aton($client), AF_INET) || $client;
|
||||
}
|
||||
else {
|
||||
$host = $client;
|
||||
}
|
||||
$hosts{$client} = $host;
|
||||
}
|
||||
}
|
||||
else {
|
||||
$host = $client;
|
||||
}
|
||||
#print LOG "$client = $host\n";
|
||||
|
||||
# Insert the log entry into the database. We increment
|
||||
# the ID so this entry has the next unique ID, and we make
|
||||
# the filename be NULL if it doesn't exist because that's
|
||||
# easier for queries to understand than a blank string.
|
||||
++$entry_id;
|
||||
++$entered;
|
||||
++$log_entered;
|
||||
$insert_entry_sth->execute($entry_id, $protocol, $protocol_version, $host,
|
||||
$date_time, $method, $file_id, $status, $bytes,
|
||||
$site_id, $log_id);
|
||||
}
|
||||
close(LOGFILE);
|
||||
$update_log_sth->execute("processed", $log_id) || die $dbh->errstr;
|
||||
}
|
Загрузка…
Ссылка в новой задаче