pjs/tools/download-stats/process-logs.pl

#!/usr/bin/perl

################################################################################
# script initialization

use strict; # protect us from ourselves
use DBI; # database stuff
use Date::Parse;
use POSIX qw(strftime);
use Socket; # DNS queries
use File::Find; # grabbing the list of log files from the filesystem
#use Fcntl ':flock'; # import LOCK_* constants for locking log files

# XXX Should use File::Basename for splitting paths up into paths and filenames.

# XXX This stuff should go into a config file.
# Top of the directory tree that is searched for log files.
my $root_dir = "/data/ftp-logs";

# Site abbreviations, written as a regex alternation because the string is
# interpolated into the path-matching pattern in process_log().
# XXX Should be generated by a database query.
my $sites = "aol|erlangen|gatech|indiana|isc|oregonstate|rediris|scarlet|utah";

# When true, a progress message is printed every 1000 log lines.
my $verbose = 1;

# When true, client IP addresses are resolved to host names before being
# stored; when false the client string from the log is stored as-is.
my $DO_REVERSE_DNS_LOOKUPS = 0;

# Everything below works from the root of the log tree, so give up right
# away if we can't get there.
unless (chdir $root_dir) {
    die "Can't change to root dir $root_dir: $!";
}

# Figure out what period of time to process the logs from.  We use
# a file's "modification time" attribute to store the most recent time
# at which logs were processed, and we process logs between that time
# and the present (i.e. from last processed time + 1 to the current time).
#my $timestamp_file = "$root_dir/last-processed";
#if (!-e $timestamp_file) {
#    # Create the timestamp file and give it a timestamp way in the past.
#    my $status = system("touch", "-t197001010000", $timestamp_file);
#    if ($status != 0) { die "Couldn't touch $timestamp_file: $!" }
#}
#my ($read_time, $last_processed_time) = (stat($timestamp_file))[8,9];
#my $start_time = $last_processed_time + 1;
#my $end_time = time;
#utime($read_time, $end_time, $timestamp_file) 
#  or die "Can't update timestamp on $timestamp_file: $!";

# Regular expressions that grab data from the log entries; pre-defined
# and pre-compiled here for performance.  The backslash in [^\"]
# isn't necessary for Perl but fixes indenting confusion in emacs.
# Pattern for every log that is not an AOL ".http_trans" file (the layout
# resembles the Apache combined log format).  Capture groups, in the order
# process_log() unpacks them:
#   1 client, 2 timestamp (the text between [ and ]), 3 method,
#   4 requested file (may include a query string), 5 protocol,
#   6 protocol version, 7 status, 8 bytes, 9 referer, 10 user agent.
# The two \S+ fields after the client are matched but not captured.
my $common_log_regex = qr/^(\S+) \S+ \S+ \[([^:]+:\d+:\d+:\d+ [^\]]+)] "(\S+) (.*?) (\S+)\/(\S+)" (\S+) (\S+) "([^\"]*)" "([^\"]*)"/o;
# Pattern for AOL logs (files whose names end in ".http_trans").  Capture
# groups, in the order process_log() unpacks them:
#   1 timestamp ("Www Mmm DD HH:MM:SS YYYY"), 2 client, 3 bytes,
#   4 requested file, 5 status, 6 user agent, 7 referer.
# The \d+ field after the timestamp is matched but not captured.  Note that
# this pattern is not anchored to the start of the line.
my $aol_log_regex = qr/(\w{3} \w{3} \d\d \d\d:\d\d:\d\d \d{4}) \d+ (\S+) (\d+) (.*?) (\S+) "([^\"]*)" "([^\"]*)"/o;

################################################################################
# database and query configuration

# Establish a database connection.
# Data source: the "downloadstats" MySQL database.
# NOTE(review): the host, user name and password are hard-coded here; they
# should move to a config file (or the environment) that is kept out of
# version control.
my $dsn = "DBI:mysql:host=mecha.mozilla.org;database=downloadstats;port=3306";
# RaiseError makes failed DBI calls die instead of returning false, so most
# database calls below are not checked individually.  PrintError is turned
# off so a failure isn't also reported as a warning, and ShowErrorStatement
# adds the text of the failing statement to the error message.
my $dbh = DBI->connect($dsn,
                       "logprocessord",
                       "1ssw?w?",
                       { RaiseError => 1,
                         PrintError => 0,
                         ShowErrorStatement => 1 }
                      );

# Prepare, once, every statement that process_log() runs repeatedly.  All
# values are passed through ? placeholders, never interpolated into the SQL.

# Inserts one log entry (one successful request).
my $insert_entry_sth = $dbh->prepare("INSERT INTO entries (id, protocol, protocol_version, 
                                      client, date_time, method, file_id, status, bytes, 
                                      query_id, site_id, log_id)
                                      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
# Files are identified by their (path, name) pair.
my $insert_file_sth = $dbh->prepare("INSERT INTO files (id, path, name) VALUES (?, ?, ?)");
my $get_file_id_sth = $dbh->prepare("SELECT id FROM files WHERE path = ? AND name = ?");
# Log files are identified by (path, name); status records how far
# processing of that log has got ("new", "processing" or "processed").
my $get_log_status_sth = $dbh->prepare("SELECT id, status FROM logs WHERE path = ? AND name = ?");
my $insert_log_sth = $dbh->prepare("INSERT INTO logs (id, path, name, site_id, status) VALUES (?, ?, ?, ?, ?)");
my $update_log_sth = $dbh->prepare("UPDATE logs SET status = ? WHERE id = ?");
# Sites are looked up by their abbreviation (the directory name under the
# log root).
my $get_site_id_sth = $dbh->prepare("SELECT id FROM sites WHERE abbr = ?");
# URL query strings are stored once each and referenced by ID from entries.
my $get_query_id_sth = $dbh->prepare("SELECT id FROM queries WHERE query = ?");
my $insert_query_sth = $dbh->prepare("INSERT INTO queries (id, query) VALUES (?, ?)");

# Get the last unique ID from the database so we know what the next one 
# should be. XXX These assume only one script process will ever be running
# at a time, which is an unsafe assumption; fix this by locking the tables
# in question whenever a new entry is to be inserted and then running
# these queries to get the maximum IDs.  Note that locking could be expensive,
# so perhaps it's better just to lock everything at the beginning and not let
# a second process access the database at all.
# Highest ID currently used in each table; process_log() pre-increments
# these to allocate new IDs.  MAX(id) is NULL for an empty table, in which
# case counting starts from 0.
my $entry_id     = $dbh->selectrow_array("SELECT MAX(id) FROM entries") || 0;
my $max_file_id  = $dbh->selectrow_array("SELECT MAX(id) FROM files") || 0;
my $max_query_id = $dbh->selectrow_array("SELECT MAX(id) FROM queries") || 0;

# Totals across every log file handled in this run.
my $seen    = 0;    # log lines read
my $entered = 0;    # entries inserted into the database

# Scratch variables for the fields of the log line being processed.
my ($client, $host);
my ($date_time, $method, $file, $protocol, $protocol_version);
my ($status, $bytes, $referer, $user_agent);
my ($file_id, $path, $filename);

# Caches that live for the whole run.
my %hosts;    # client address      => resolved host name
my %files;    # requested file path => ID in the files table


################################################################################
# main body

# Walk the whole tree below $root_dir, handing every entry found (files and
# directories alike) to process_log().
find({ wanted => \&process_log }, $root_dir);

################################################################################
# functions

sub process_log {
    # Processes a single log file, inserting its relevant entries into the
    # database.
    #
    # Called by File::Find::find() as its "wanted" callback, so it takes no
    # arguments: $_ holds the file's name, $File::Find::dir the directory
    # containing it, and $File::Find::name the two joined together.
    #
    # A file is skipped (with a message on STDOUT) when it is a directory,
    # when its location doesn't look like <root>/<site>/[http/|trillian/]<file>,
    # when its site has no row in the sites table, or when the logs table
    # says it is already "processed" or "processing".  Otherwise the log is
    # marked "processing", every successful (200/206) request for a file
    # under a releases, extensions, webtools or language pack path is
    # inserted into the entries table, and the log is marked "processed".
    #
    # Dies if the file can't be opened or a database call fails (the
    # database handle is created with RaiseError set).  Returns nothing.

    my $log_seen = 0;
    my $log_entered = 0;

    $File::Find::name && $File::Find::dir && $_
      or die "process_log() called without name of file: $File::Find::name\n";

    my $logfile = $_;

    # Path of the containing directory relative to the log root.  Fall back
    # to the full directory when it isn't below the root (or is the root
    # itself); $1 is only consulted when the match actually succeeded, so a
    # stale capture from an earlier match can never leak in.
    my $relative_path = $File::Find::dir;
    if ($File::Find::dir =~ m{^\Q$root_dir\E/(.*)$} && $1) {
        $relative_path = $1;
    }

    # Don't process the file if it's a directory.  (The bare name can be
    # tested because File::Find changes into each directory by default.)
    if (-d $logfile) {
        print "Not processing $File::Find::name; directory\n";
        return;
    }

    # Don't process the file if it isn't a log file.
    # XXX This test may be too brittle, assuming a certain directory
    # and file structure.  It does, however, deal with HTTP logs
    # which aren't in an http/ subdirectory of the site directory.
    # The root and file name are quoted with \Q...\E so characters such as
    # "." or "+" in them are matched literally; $sites is deliberately left
    # unquoted because it is a regex alternation.
    my ($site) = $File::Find::name
      =~ m{^\Q$root_dir\E/($sites)/(?:http/|trillian/)?\Q$logfile\E$};
    if (!defined $site) {
        print "Not processing $File::Find::name; not an HTTP log file.\n";
        return;
    }

    # Grab the site's unique ID from the sites table.
    my ($site_id) = $dbh->selectrow_array($get_site_id_sth, {}, $site);
    if (!$site_id) {
        print "Not processing $File::Find::name; couldn't find an entry " .
              "in the sites table for $site.\n";
        return;
    }

    # The name of the log file without any suffix indicating compression.
    # This script treats foo.log and foo.log.gz as the same file so we can
    # compress logs after processing them and they won't get reprocessed.
    # Note that the "name" column in the "logs" table stores the base name
    # of the log file, not the actual name.
    my $basename = $logfile;
    $basename =~ s/\.(gz|bz2)$//;

    # Get the log file's unique ID and status from the database.  The log's
    # status gets its own variable so it can't be confused with the HTTP
    # status of individual log lines below.
    my ($log_id, $log_status) =
        $dbh->selectrow_array($get_log_status_sth, {}, $relative_path, $basename);

    if (!$log_id) {
        print "Creating entry in database for log $File::Find::name.\n";
        #$dbh->do("LOCK TABLES logs WRITE");
        ($log_id) = $dbh->selectrow_array("SELECT MAX(id) FROM logs");
        $log_id = ($log_id || 0) + 1;
        $insert_log_sth->execute($log_id, $relative_path, $basename, $site_id, "new");
        #$dbh->do("UNLOCK TABLES");
    }
    elsif ($log_status eq "processed") {
        print "Not processing log $File::Find::name; already processed.\n";
        return;
    }
    elsif ($log_status eq "processing") {
        print "Not processing log $File::Find::name; already being processed.\n";
        return;
    }

    print "Processing $File::Find::name.\n";

    $update_log_sth->execute("processing", $log_id) || die $dbh->errstr;

    # Open the log, decompressing on the fly if necessary.  The list form of
    # the pipe open runs the decompressor directly instead of through a
    # shell, so spaces or shell metacharacters in a file name are harmless.
    my $log_fh;
    if ($logfile =~ /\.gz$/) {
        open($log_fh, '-|', 'gunzip', '-c', $File::Find::name)
          or die "Couldn't open gzipped file $File::Find::name for reading: $!";
    }
    elsif ($logfile =~ /\.bz2$/) {
        open($log_fh, '-|', 'bunzip2', '-c', $File::Find::name)
          or die "Couldn't open bzip2ed file $File::Find::name for reading: $!";
    }
    else {
        open($log_fh, '<', $File::Find::name)
          or die "Couldn't open file for reading: $!";
    }

    # The log format depends only on the file name, so decide once.
    my $is_aol_log = ($File::Find::name =~ /\.http_trans$/) ? 1 : 0;

    # Read into a lexical rather than $_: File::Find expects its callback to
    # leave $_ untouched.
    while (my $line = <$log_fh>) {
        # Periodically print out a message about our progress
        # so users know if the script has frozen or is going slowly.
        ++$seen;
        ++$log_seen;
        print "Processed $log_entered/$log_seen entries for $relative_path/$logfile ($entered/$seen total).\n"
            if ($seen % 1000 == 0) && $verbose;

        # Fields of this line.  All of them stay undefined when the line
        # doesn't match the expected format.
        my ($client, $date_time, $method, $file, $protocol, $protocol_version,
            $status, $bytes, $referer, $user_agent);
        if ($is_aol_log) {
            # AOL logs carry no method or protocol information, so those
            # fields are left undefined and stored as NULL.
            ($date_time, $client, $bytes, $file, $status, $user_agent, $referer)
                = ($line =~ $aol_log_regex);
        }
        else {
            ($client, $date_time, $method, $file, $protocol, $protocol_version,
             $status, $bytes, $referer, $user_agent) = ($line =~ $common_log_regex);
        }

        # Count only successful requests (whether partial or full).  This
        # also drops lines that didn't match the log format at all.
        next unless defined $status && ($status == 200 || $status == 206);

        # Strip the URL query string, if any, from the file string.  The
        # limit of 2 keeps the whole query string together even when it
        # contains further question marks.
        ($file, my $query) = split(/\?/, $file, 2);

        my $query_id;
        if ($query) {
            ($query_id) = $dbh->selectrow_array($get_query_id_sth, {}, $query);
            if (!$query_id) {
                $query_id = ++$max_query_id;
                $insert_query_sth->execute($query_id, $query);
            }
        }

        # Split up the file string into a path and a name.  Requests with
        # no slash at all can't be split, so they are skipped rather than
        # picking up leftover captures from the log-line match.
        next unless defined $file;
        my ($path, $filename) = ($file =~ m{^(.*)/([^/]*)$});
        next unless defined $path;

        # Only deal with releases, extensions, webtools, and language packs at this point.
        next unless $path =~ m{releases|extensions|webtools|mozilla/l10n/lang};

        # Don't bother storing directory accesses, since we don't do anything with them.
        next if !$filename;

        # Get the file's unique ID or create a record for it if none exists
        # yet.  Either way the ID is remembered in %files so the next
        # request for the same file needs no database lookup.
        my $file_id = $files{$file};
        if (!$file_id) {
            ($file_id) = $dbh->selectrow_array($get_file_id_sth, {}, $path, $filename);
            if (!$file_id) {
                $file_id = ++$max_file_id;
                $insert_file_sth->execute($file_id, $path, $filename);
            }
            $files{$file} = $file_id;
        }

        # Convert the timestamp into MySQL's format (including folding the timezone
        # into the time to convert it to local time, since MySQL DATETIME types
        # don't store timezone information).
        $date_time = strftime("%Y/%m/%d %H:%M:%S", localtime(str2time($date_time)));

        my $host;
        if ($DO_REVERSE_DNS_LOOKUPS) {
            # Do a reverse DNS lookup to get the domain name from the IP
            # address, caching the answer (or the bare address when the
            # lookup fails) in %hosts.
            $host = $hosts{$client};
            if (!$host) {
                if ($client =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/) {
                    $host = gethostbyaddr(inet_aton($client), AF_INET) || $client;
                }
                else {
                    $host = $client;
                }
                $hosts{$client} = $host;
            }
        }
        else {
            $host = $client;
        }

        # Insert the log entry into the database.  We increment
        # the ID so this entry has the next unique ID.
        ++$entry_id;
        ++$entered;
        ++$log_entered;
        $insert_entry_sth->execute($entry_id, $protocol, $protocol_version, $host,
                                   $date_time, $method, $file_id, $status, $bytes,
                                   $query_id, $site_id, $log_id);
    }

    # For a piped open, close() also reports a decompressor that exited with
    # an error ($! is then 0 and the exit status is in $?).  Make that
    # visible instead of ignoring it.
    if (!close($log_fh)) {
        my $reason = $! ? "$!" : "exit status " . ($? >> 8);
        warn "Problem closing $File::Find::name: $reason\n";
    }

    $update_log_sth->execute("processed", $log_id) || die $dbh->errstr;
    return;
}
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`#!/usr/bin/perl`

			`################################################################################`
			`# script initialization`

			`use strict; # protect us from ourselves`
			`use DBI; # database stuff`
			`use Date::Parse;`
			`use POSIX qw(strftime);`
			`use Socket; # DNS queries`
			`use File::Find; # grabbing the list of log files from the filesystem`
			`#use Fcntl ':flock'; # import LOCK_* constants for locking log files`

Don't assume we're in the root dir and intensify some comments about what we should be doing. 2004-05-22 20:43:20 +04:00			`# XXX Should use File::Basename for splitting paths up into paths and filenames.`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00
Don't assume we're in the root dir and intensify some comments about what we should be doing. 2004-05-22 20:43:20 +04:00			`# XXX This stuff should go into a config file.`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`my $root_dir = "/data/ftp-logs";`
deal with funky directory for gatech logs 2004-06-17 03:17:41 +04:00			`my $sites = "aol\|erlangen\|gatech\|indiana\|isc\|oregonstate\|rediris\|scarlet\|utah"; # XXX Should be generated by a database query.`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`my $verbose = 1;`
			`my $DO_REVERSE_DNS_LOOKUPS = 0;`

Don't assume we're in the root dir and intensify some comments about what we should be doing. 2004-05-22 20:43:20 +04:00			`chdir $root_dir`
			`or die "Can't change to root dir $root_dir: $!";`

initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`# Figure out what period of time to process the logs from. We use`
			`# a file's "modification time" attribute to store the most recent time`
			`# at which logs were processed, and we process logs between that time`
			`# and the present (i.e. from last processed time + 1 to the current time).`
			`#my $timestamp_file = "$root_dir/last-processed";`
			`#if (!-e $timestamp_file) {`
			`# # Create the timestamp file and give it a timestamp way in the past.`
			`# my $status = system("touch", "-t197001010000", $timestamp_file);`
			`# if ($status != 0) { die "Couldn't touch $timestamp_file: $!" }`
			`#}`
			`#my ($read_time, $last_processed_time) = (stat($timestamp_file))[8,9];`
			`#my $start_time = $last_processed_time + 1;`
			`#my $end_time = time;`
			`#utime($read_time, $end_time, $timestamp_file)`
			`# or die "Can't update timestamp on $timestamp_file: $!";`

			`# Regular expressions that grab data from the log entries; pre-defined`
			`# and pre-compiled here for performance. The backslash in [^\"]`
			`# isn't necessary for Perl but fixes indenting confusion in emacs.`
			`my $common_log_regex = qr/^(\S+) \S+ \S+ \[([^:]+:\d+:\d+:\d+ [^\]]+)] "(\S+) (.?) (\S+)\/(\S+)" (\S+) (\S+) "([^\"])" "([^\"]*)"/o;`
			`my $aol_log_regex = qr/(\w{3} \w{3} \d\d \d\d:\d\d:\d\d \d{4}) \d+ (\S+) (\d+) (.?) (\S+) "([^\"])" "([^\"]*)"/o;`

			`################################################################################`
			`# database and query configuration`

			`# Establish a database connection.`
			`my $dsn = "DBI:mysql:host=mecha.mozilla.org;database=downloadstats;port=3306";`
			`my $dbh = DBI->connect($dsn,`
			`"logprocessord",`
			`"1ssw?w?",`
			`{ RaiseError => 1,`
			`PrintError => 0,`
			`ShowErrorStatement => 1 }`
			`);`

			`# Prepare the statements we're going to use to insert HTTP log entries into`
			`# the database.`
			`my $insert_entry_sth = $dbh->prepare("INSERT INTO entries (id, protocol, protocol_version,`
start storing the query string in the database 2004-06-30 05:14:45 +04:00			`client, date_time, method, file_id, status, bytes,`
			`query_id, site_id, log_id)`
			`VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`my $insert_file_sth = $dbh->prepare("INSERT INTO files (id, path, name) VALUES (?, ?, ?)");`
			`my $get_file_id_sth = $dbh->prepare("SELECT id FROM files WHERE path = ? AND name = ?");`
			`my $get_log_status_sth = $dbh->prepare("SELECT id, status FROM logs WHERE path = ? AND name = ?");`
			`my $insert_log_sth = $dbh->prepare("INSERT INTO logs (id, path, name, site_id, status) VALUES (?, ?, ?, ?, ?)");`
			`my $update_log_sth = $dbh->prepare("UPDATE logs SET status = ? WHERE id = ?");`
			`my $get_site_id_sth = $dbh->prepare("SELECT id FROM sites WHERE abbr = ?");`
start storing the query string in the database 2004-06-30 05:14:45 +04:00			`my $get_query_id_sth = $dbh->prepare("SELECT id FROM queries WHERE query = ?");`
			`my $insert_query_sth = $dbh->prepare("INSERT INTO queries (id, query) VALUES (?, ?)");`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00
			`# Get the last unique ID from the database so we know what the next one`
			`# should be. XXX These assume only one script process will ever be running`
			`# at a time, which is an unsafe assumption; fix this by locking the tables`
			`# in question whenever a new entry is to be inserted and then running`
			`# these queries to get the maximum IDs. Note that locking could be expensive,`
			`# so perhaps it's better just to lock everything at the beginning and not let`
			`# a second process access the database at all.`
			`my ($entry_id) = $dbh->selectrow_array("SELECT MAX(id) FROM entries") \|\| 0;`
			`my ($max_file_id) = $dbh->selectrow_array("SELECT MAX(id) FROM files") \|\| 0;`
start storing the query string in the database 2004-06-30 05:14:45 +04:00			`my ($max_query_id) = $dbh->selectrow_array("SELECT MAX(id) FROM queries") \|\| 0;`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00
			`my $seen = 0;`
			`my $entered = 0;`

			`my ($client, $date_time, $method, $file, $protocol, $protocol_version,`
			`$status, $bytes, $referer, $user_agent, $host, $file_id, $path, $filename);`

			`my %hosts;`
			`my %files;`


			`################################################################################`
			`# main body`

			`find(\&process_log, $root_dir);`

			`################################################################################`
			`# functions`

			`sub process_log {`
			`# Processes a log file, inserting relevant entries into the database.`
			`# Called from File::Find::find with $_ containing the filename,`
			`# $File::Find::dir containing the path, and $File::Find::name`
			`# containing the path + name.`

			`my $log_seen = 0;`
			`my $log_entered = 0;`

			`$File::Find::name && $File::Find::dir && $_`
			`or die "process_log() called without name of file: $File::Find::name\n";`

			`my $logfile = $_;`
			`$File::Find::dir =~ m\|^$root_dir/(.*)$\|;`
			`my $relative_path = $1 \|\| $File::Find::dir;`

			`# Don't process the file if it's a directory.`
			`if (-d $logfile) {`
print informative messages to STDOUT instead of a log file. We can use standard shell redirection to put them into a log file if we want. 2004-05-22 18:14:16 +04:00			`print "Not processing $File::Find::name; directory\n";`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`return;`
			`}`

			`# Don't process the file if it isn't a log file.`
			`# XXX This test may be too brittle, assuming a certain directory`
			`# and file structure. It does, however, deal with HTTP logs`
			`# which aren't in an http/ subdirectory of the site directory.`
deal with funky directory for gatech logs 2004-06-17 03:17:41 +04:00			`if ($File::Find::name !~ m!^$root_dir/($sites)/(http/\|trillian/)?$logfile$!) {`
print informative messages to STDOUT instead of a log file. We can use standard shell redirection to put them into a log file if we want. 2004-05-22 18:14:16 +04:00			`print "Not processing $File::Find::name; not an HTTP log file.\n";`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`return;`
			`}`

			`# Grab the site's unique ID from the sites table.`
			`my $site = $1;`
			`my ($site_id) = $dbh->selectrow_array($get_site_id_sth, {}, $site);`
			`if (!$site_id) {`
print informative messages to STDOUT instead of a log file. We can use standard shell redirection to put them into a log file if we want. 2004-05-22 18:14:16 +04:00			`print "Not processing $File::Find::name; couldn't find an entry " .`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`"in the sites table for $site.\n";`
			`return;`
			`}`

Treat compressed files as if they were the same as their uncompressed versions so that we can compress files and not have them be reprocessed. Also, handle bzip2ed files. 2004-05-22 18:09:58 +04:00			`# The name of the log file without any suffix indicating compression.`
			`# This script treats foo.log and foo.log.gz as the same file so we can`
			`# compress logs after processing them and they won't get reprocessed.`
			`# Note that the "name" column in the "logs" table stores the base name`
			`# of the log file, not the actual name.`
			`my $basename = $logfile;`
			`$basename =~ s/\.(gz\|bz2)$//;`

initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`# Get the log file's unique ID and status from the database.`
			`my ($log_id, $status) =`
Treat compressed files as if they were the same as their uncompressed versions so that we can compress files and not have them be reprocessed. Also, handle bzip2ed files. 2004-05-22 18:09:58 +04:00			`$dbh->selectrow_array($get_log_status_sth, {}, $relative_path, $basename);`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00
			`if (!$log_id) {`
print informative messages to STDOUT instead of a log file. We can use standard shell redirection to put them into a log file if we want. 2004-05-22 18:14:16 +04:00			`print "Creating entry in database for log $File::Find::name.\n";`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`#$dbh->do("LOCK TABLES logs WRITE");`
			`($log_id) = $dbh->selectrow_array("SELECT MAX(id) FROM logs");`
			`$log_id = ($log_id \|\| 0) + 1;`
Treat compressed files as if they were the same as their uncompressed versions so that we can compress files and not have them be reprocessed. Also, handle bzip2ed files. 2004-05-22 18:09:58 +04:00			`$insert_log_sth->execute($log_id, $relative_path, $basename, $site_id, "new");`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`#$dbh->do("UNLOCK TABLES");`
			`}`
			`elsif ($status eq "processed") {`
print informative messages to STDOUT instead of a log file. We can use standard shell redirection to put them into a log file if we want. 2004-05-22 18:14:16 +04:00			`print "Not processing log $File::Find::name; already processed.\n";`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`return;`
			`}`
			`elsif ($status eq "processing") {`
print informative messages to STDOUT instead of a log file. We can use standard shell redirection to put them into a log file if we want. 2004-05-22 18:14:16 +04:00			`print "Not processing log $File::Find::name; already being processed.\n";`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`return;`
			`}`

print informative messages to STDOUT instead of a log file. We can use standard shell redirection to put them into a log file if we want. 2004-05-22 18:14:16 +04:00			`print "Processing $File::Find::name.\n";`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00
			`$update_log_sth->execute("processing", $log_id) \|\| die $dbh->errstr;`

			`if ($logfile =~ /\.gz$/) {`
			`open(LOGFILE, "gunzip -c $File::Find::name \|")`
Treat compressed files as if they were the same as their uncompressed versions so that we can compress files and not have them be reprocessed. Also, handle bzip2ed files. 2004-05-22 18:09:58 +04:00			`or die "Couldn't open gzipped file $File::Find::name for reading: $!";`
			`}`
			`elsif ($logfile =~ /\.bz2$/) {`
			`open(LOGFILE, "bunzip2 -c $File::Find::name \|")`
			`or die "Couldn't open bzip2ed file $File::Find::name for reading: $!";`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`}`
			`else {`
			`open(LOGFILE, "< $File::Find::name")`
			`or die "Couldn't open file for reading: $!";`
			`}`
			`while (<LOGFILE>) {`
			`# Periodically print out a message about our progress`
			`# so users know if the script has frozen or is going slowly.`
			`++$seen;`
			`++$log_seen;`
print informative messages to STDOUT instead of a log file. We can use standard shell redirection to put them into a log file if we want. 2004-05-22 18:14:16 +04:00			`print "Processed $log_entered/$log_seen entries for $relative_path/$logfile ($entered/$seen total).\n"`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`if ($seen % 1000 == 0) && $verbose;`

			`if ($File::Find::name =~ /\.http_trans$/) {`
			`($date_time, $client, $bytes, $file, $status, $user_agent, $referer)`
			`= ($_ =~ $aol_log_regex);`
			`$method = $protocol = $protocol_version = undef;`
			`}`
			`else {`
			`($client, $date_time, $method, $file, $protocol, $protocol_version,`
			`$status, $bytes, $referer, $user_agent) = ($_ =~ $common_log_regex);`
			`}`
print informative messages to STDOUT instead of a log file. We can use standard shell redirection to put them into a log file if we want. 2004-05-22 18:14:16 +04:00			`#print "$client, $date_time, $method, $file, $protocol, $protocol_version, $status, $bytes, $referer, $user_agent\n";`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00
			`# Count only successful requests (whether partial or full).`
			`next unless $status == 200 \|\| $status == 206;`

strip the query string before dividing the file into path and filename so that we don't divide on a slash in the query string by accident 2004-06-18 04:12:51 +04:00			`# Strip the URL query string, if any, from the file string.`
start storing the query string in the database 2004-06-30 05:14:45 +04:00			`($file, my $query) = (split(/\?/, $file));`

			`my $query_id;`
			`if ($query) {`
			`($query_id) = $dbh->selectrow_array($get_query_id_sth, {}, $query);`
			`if (!$query_id) {`
			`$query_id = ++$max_query_id;`
			`$insert_query_sth->execute($query_id, $query);`
			`}`
			`}`
strip the query string before dividing the file into path and filename so that we don't divide on a slash in the query string by accident 2004-06-18 04:12:51 +04:00
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`# Split up the file string into a path and a name.`
			`$file =~ /^(.)\/([^\/])$/;`
			`($path, $filename) = ($1, $2);`

collect stats for extensions as well 2004-06-18 03:02:17 +04:00			`# Only deal with releases, extensions, webtools, and language packs at this point.`
			`next if $path !~ /releases/ && $path !~ /extensions/ && $path !~ /webtools/ && $path !~ /mozilla\/l10n\/lang/;`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00
			`# Don't bother storing directory accesses, since we don't do anything with them.`
			`next if !$filename;`

			`# Get the file's unique ID or create a record for it if none exists yet.`
			`$file_id = $files{$file};`
			`if (!$file_id) {`
			`($file_id) = $dbh->selectrow_array($get_file_id_sth, {}, $path, $filename);`
			`if ($file_id) { $files{$file} = $file_id }`
			`else {`
			`$file_id = ++$max_file_id;`
			`$insert_file_sth->execute($file_id, $path, $filename \|\| undef);`
			`}`
			`}`

			`# Convert the timestamp into MySQL's format (including folding the timezone`
			`# into the time to convert it to local time, since MySQL DATETIME types`
			`# don't store timezone information).`
			`$date_time = strftime("%Y/%m/%d %H:%M:%S", localtime(str2time($date_time)));`

			`if ($DO_REVERSE_DNS_LOOKUPS) {`
			`# Do a reverse DNS lookup to get the domain name from the IP address.`
			`$host = $hosts{$client};`
			`if (!$host) {`
			`if ($client =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/) {`
			`$host = gethostbyaddr(inet_aton($client), AF_INET) \|\| $client;`
			`}`
			`else {`
			`$host = $client;`
			`}`
			`$hosts{$client} = $host;`
			`}`
			`}`
			`else {`
			`$host = $client;`
			`}`
print informative messages to STDOUT instead of a log file. We can use standard shell redirection to put them into a log file if we want. 2004-05-22 18:14:16 +04:00			`#print "$client = $host\n";`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00
			`# Insert the log entry into the database. We increment`
			`# the ID so this entry has the next unique ID, and we make`
			`# the filename be NULL if it doesn't exist because that's`
			`# easier for queries to understand than a blank string.`
			`++$entry_id;`
			`++$entered;`
			`++$log_entered;`
			`$insert_entry_sth->execute($entry_id, $protocol, $protocol_version, $host,`
			`$date_time, $method, $file_id, $status, $bytes,`
start storing the query string in the database 2004-06-30 05:14:45 +04:00			`$query_id, $site_id, $log_id);`
initial checkin of download-stats tool for processing download logs into MySQL database and then generating stats from them 2004-05-14 01:41:14 +04:00			`}`
			`close(LOGFILE);`
			`$update_log_sth->execute("processed", $log_id) \|\| die $dbh->errstr;`
			`}`