extproc perl -x -S
#!/usr/bin/perl

# Parse PowerWeb HTTP log and calculate server statistics.
#
# countHTTP   Version 0.1  [10/12/1997]
#
# Copyright (C) 1997 by Ivan Adzhubei
#
# Web address:		http://www.protein.bio.msu.su/
# Send comments to:	ivan@protein.bio.msu.su

# Lookup table: three-letter English month abbreviation => two-digit,
# zero-padded month number ('01' .. '12'), built from the names in
# calendar order.
%monthNum = do {
  my $ordinal = 0;
  map { ($_ => sprintf('%02d', ++$ordinal)) }
      qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
};

# Built-in defaults: the log file to read (overridable with -f) and the
# loopback address / host name whose requests are never counted.
($logFile, $localIP, $localHost) =
    ('/powerweb/logs/HTTP.log', '127.0.0.1', 'localhost');

# These are the IP address range and domain name of the clients whose requests
# are excluded from all statistics calculations. Useful for excluding your own
# hosts (which produce lots of requests while you test/configure PW) from
# countHTTP results. Configurable via the command line options -en/-ed, but you
# may place defaults for your network right here in the code to avoid typing an
# excessively long command line each time you start countHTTP.
#
# *** UNCOMMENT AND EDIT THE TWO LINES BELOW ***
#
#$myNetwork  = '111.222.333';
#$myDomain   = 'mydomain.name.com';

# Parse command line options.
# Leading arguments that start with '-' are consumed one at a time. When the
# loop ends, $arg holds the argument that stopped it (or a false value when
# the argument list ran out); the code after the loop inspects it.
while (($arg = shift @ARGV) && $arg =~ /^-/) {
  # -?, -h, -help: show the help screen and quit
  if ($arg =~ /^-(\?|h(elp)?)$/) {
    printUsage();
    exit;
  }

  # Switches whose value is glued to the switch itself. -en / -ed given
  # without a value leave the corresponding variable undefined.
  if    ($arg =~ /^-f(.+)/)   { $logFile   = $1; next; }
  elsif ($arg =~ /^-en(.+)?/) { $myNetwork = $1; next; }
  elsif ($arg =~ /^-ed(.+)?/) { $myDomain  = $1; next; }

  # Single-letter switches; several may be bundled into one argument since
  # each letter is looked for anywhere in the argument.
  foreach my $switch (['n', \$resolveName], ['l', \$printLoad],
                      ['d', \$printDocs],   ['c', \$printClients],
                      ['s', \$strictHTML],  ['t', \$printTotals]) {
    ${ $switch->[1] } = 1 if index($arg, $switch->[0]) >= 0;
  }

  # Stop on -<date>, since we assume this is an ending date parameter.
  last if $arg =~ /^-\d+\/\d+\/\d+$/;
}

# Next arg is date range?
# A range looks like dd/mm/yyyy-dd/mm/yyyy where either side may be omitted.
# The match result itself is tested: after a failed match $1/$2 are not reset
# and may still hold captures from an earlier successful match, so they must
# not be consulted unless this match succeeded. $arg may also be undefined
# when the argument list was exhausted by the option loop.
if (defined $arg && $arg =~ /^(\d+\/\d+\/\d+)?-(\d+\/\d+\/\d+)?$/ && ($1 || $2)) {
  $firstDate = $1; $lastDate = $2;

  # Dates are packed into one integer (every month treated as 31 days long)
  # so that they can be compared with plain numeric operators.
  if ($firstDate) {
    ($firstDay, $firstMonth, $firstYear) = map { $_ + 0 } split /\//, $firstDate;
    die "Illegal starting date format\n" if $firstMonth < 1 || $firstMonth > 12 ||
        $firstDay < 1 || $firstDay > 31;
    $packedFirst = $firstYear * 12 * 31 + ($firstMonth - 1) * 31 + $firstDay - 1;
  }
  if ($lastDate) {
    ($lastDay, $lastMonth, $lastYear) = map { $_ + 0 } split /\//, $lastDate;
    die "Illegal ending date format\n" if $lastMonth < 1 || $lastMonth > 12 ||
        $lastDay < 1 || $lastDay > 31;
    $packedLast  = $lastYear  * 12 * 31 + ($lastMonth  - 1) * 31 + $lastDay  - 1;
  }

  # The argument was a date range, not a document filter
  $arg = '';
}

# Whatever follows (if anything) is the document filter regexp
$arg = shift @ARGV if @ARGV;

# Read the log and accumulate the raw counters:
#   $totalRequests{document}{client}   hits per document per client
#   $totalLoad{year}{month}{day}{hour} hits per hour of each day
#
# Three-argument open: the file name may come from the -f option, and with
# the two-argument form a name starting with a mode character ('>', '|', ...)
# would not be treated as a plain file to read.
open(LOG, '<', $logFile) || die "Can't open file \"$logFile\": $!\n";

while (<LOG>) {
  chomp;
  next if /^\s*$/;

  # Expected line layout: three whitespace-separated fields, a bracketed
  # timestamp and a quoted request. Capture variables are only meaningful
  # after a successful match, so a line that fails to parse is reported and
  # skipped rather than counted with left-over values.
  unless (/^(\S+)\s+(\S+)\s+(\S+)\s+\[(.+?)\]\s+"(.+?)"/) {
    print STDERR "WARNING! - Malformed log line:\n$_\n";
    next;
  }
  $client = $1; $server = $2; $auth = $3; $datetime = $4; $request = $5;

  # Request is "METHOD document ..."; timestamp is "dd/Mon/yyyy:hh:mm:ss ...".
  # In list context a failed match yields no values, leaving the targets
  # undefined, which is what the check below relies on.
  ($document) = $request =~ /^\w+\s+(\S+)/;
  ($day, $monthName, $year, $time, $hour) =
      $datetime =~ /^(\d\d)\/(\w+)\/(\d\d\d\d):((\d\d):\d\d:\d\d)/;
  unless (defined $document && defined $hour && $monthNum{$monthName}) {
    print STDERR "WARNING! - Malformed log line:\n$_\n";
    next;
  }

  # $date keeps the printable dd/mm/yyyy form; the parts become plain numbers
  $month = $monthNum{$monthName};
  $date  = "$day/$month/$year";
  $day += 0; $month += 0; $year += 0; $hour += 0;

  $firstLog = $date unless $firstLog;

  # Skip local requests and the excluded network/domain.
  # NOTE: these values are interpolated into the patterns as regular
  # expressions, so an unescaped '.' in them matches any character.
  next if $client =~ /^$localIP/   || $client =~ /^$localHost$/;
  next if ($myNetwork && $client =~ /^$myNetwork/) ||
          ($myDomain  && $client =~ /$myDomain$/i);

  # Strict HTML mode: keep only directory, bare-name and .htm(l) requests
  # without query/form characters, and drop script directories. The '$' in
  # the character class is escaped: unescaped, "$&" is interpolated as the
  # last-match variable instead of excluding a literal dollar sign.
  next if $strictHTML && $document !~ /^[^\$&?=]*?\/(\w+(\.htm(l)?)?)?$/i;
  next if $strictHTML && $document =~ /\/(cgi-bin|perl-bin|rexx-bin)\//i;

  # Same packing as used for the requested date range (31-day months)
  $packedDate  = $year * 12 * 31 + ($month - 1) * 31 + $day - 1
      if $firstDate || $lastDate;

  next if ($firstDate && $packedDate < $packedFirst) ||
          ($lastDate  && $packedDate > $packedLast);

  # Optional document filter given on the command line
  next if $arg && $document !~ /$arg/i;

  $totalRequests{$document}{$client}++;
  $totalLoad{$year}{$month}{$day}{$hour}++;

}

close(LOG);

# Walk every (document, client) pair once to build:
#   %clientName     client as logged => name used in reports
#   %documentCount  hits per document
#   %clientCount    hits per reported client name
#   $totRequests    grand total
# With -n the reported name is looked up with nslookup; otherwise, or when
# the lookup yields nothing, the client string from the log is used as is.
print STDERR "Resolving client names" if $resolveName;
foreach $document (keys %totalRequests) {
  foreach $client (keys %{$totalRequests{$document}}) {
    print STDERR '.' if $resolveName;

    # Resolve each client only once. The name is cached in %clientName and
    # reused for every further document the same client requested, instead
    # of being cleared and looked up again for each pair.
    unless (exists $clientName{$client}) {
      $cliName = '';
      # The client string comes from the log file and is handed to the
      # shell, so only host-name / IP address characters are accepted.
      # As before, stderr of nslookup is redirected to "nul".
      if ($resolveName && $client =~ /^[\w.:-]+$/) {
        foreach $line (`nslookup $client 2>nul`) {
          $cliName = $1 if $line =~ /^Name:\s+(\S+)/;
        }
      }
      $clientName{$client} = $cliName || $client;
    }

    my $hits = $totalRequests{$document}{$client};
    $documentCount{$document}         += $hits;
    $clientCount{$clientName{$client}} += $hits;
    $totRequests                      += $hits;
  }
}
print STDERR "\n" if $resolveName;

# -d: list documents, most requested first. Under each document the clients
# that requested it are listed (busiest first) unless -t asked for totals
# only; a subtotal line closes every document.
if ($printDocs) {
  my @ranked = sort { $documentCount{$b} <=> $documentCount{$a} } keys %documentCount;
  for my $doc (@ranked) {
    print "$doc:\n";
    if (!$printTotals) {
      my $hitsBy = $totalRequests{$doc};
      for my $who (sort { $hitsBy->{$b} <=> $hitsBy->{$a} } keys %$hitsBy) {
        printf "  %-36s%5s\n", $clientName{$who}, '(' . $hitsBy->{$who} . ')';
      }
    }
    printf "  Subtotal:%32s\n", '(' . $documentCount{$doc} . ')';
  }
}

# -c: one line per client name with its total number of hits, busiest first
if ($printClients) {
  my @busiestFirst = sort { $clientCount{$b} <=> $clientCount{$a} } keys %clientCount;
  printf("%-36s%5s\n", $_, '(' . $clientCount{$_} . ')') for @busiestFirst;
}

# Summary line. When no date range was requested, report the period actually
# covered by the log: the first date seen and the date of the last line read.
$firstDate = $firstLog unless $firstDate;
$lastDate  = $date     unless $lastDate;

print "\n------\nTotal ", $strictHTML ? 'HTML ' : '', "requests $firstDate - $lastDate";
print " for '$arg'" if $arg;
# $totRequests is never assigned when nothing was counted; show 0 then
# instead of an empty string.
print ": ", $totRequests || 0, "\n";

# Everything below is the -l (server load) report
exit unless $printLoad;

# Collapse the year/month/day/hour counters into
#   $monthLoad{year}{month}  hits per calendar month
#   $hourLoad{hour}          hits per hour of day, summed over all days
#   $hourList{hour}          number of days on which that hour had any hits
foreach $year (keys %totalLoad) {
  foreach $month (keys %{$totalLoad{$year}}) {
    foreach $day (keys %{$totalLoad{$year}{$month}}) {
      foreach $hour (keys %{$totalLoad{$year}{$month}{$day}}) {
        my $hits = $totalLoad{$year}{$month}{$day}{$hour};
        $monthLoad{$year}{$month} += $hits;
        $hourLoad{$hour}          += $hits;
        $hourList{$hour}++;
      }
    }
  }
}

# Average hits for each hour of the day, taken over the days on which that
# hour was active; hours never seen count as 0 hits over 1 day.
for $hour (0..23) {
  $aveLoad = ($hourLoad{$hour} || 0) / ($hourList{$hour} || 1);
  $maxLoad = $aveLoad > $maxLoad ? $aveLoad : $maxLoad;
  $totLoad += $aveLoad;
  push @averageLoad, $aveLoad;
}

# Express the averages as rounded percentages of the whole day. With no
# counted requests $totLoad is 0; use 0% throughout instead of dividing by
# zero, which leaves just the empty chart frame.
@percentLoad = map { $totLoad ? int($_ / $totLoad * 100 + 0.5) : 0 } @averageLoad;
$maxLoad     = $totLoad ? $maxLoad / $totLoad * 100 : 0;

# Bar chart: one text row per percent, from the tallest bar down to 1%,
# with a label on every fifth row and one column per hour.
print "\n------\nServer daily load (percent/hour):\n\n";
for ($percent=int($maxLoad+0.5); $percent>0; $percent--) {
  printf("%5s",($percent % 5) ? '|' : "$percent-|");
  for $hour (0..23) {
    print($percentLoad[$hour] >= $percent ? '*' : '.');
  }
  print "\n";
}
print "  0-|", '-' x 24, "\n";
print "     |  |  |  |  |  |  |  |\n";
print "     0  3  6  9 12 15 18 21\n";

# Monthly profile: one "yyyy/mm: hits" line per month, in chronological order
print "\n------\nServer load profile (hits/month):\n\n";
for my $yr (sort { $a <=> $b } keys %monthLoad) {
  my $hitsIn = $monthLoad{$yr};
  for my $mon (sort { $a <=> $b } keys %$hitsIn) {
    printf "%d/%02d:%8d\n", $yr, $mon, $hitsIn->{$mon};
  }
}

# Print the help screen (program banner, usage line and option summary) to
# standard output. Takes no arguments; returns whatever print returns.
sub printUsage {
  my $helpText = <<'EOT';
countHTTP for PowerWeb++ Server                       Version 0.1  [10/12/1997]

usage: countHTTP [options] [date_range] [tag_regexp]
  [date_range] is in form: dd/mm/yyyy-dd/mm/yyyy, either start or end date
            is optional, but '-' separator is mandatory;
  [tag_regexp] is Perl style regular expression, only matching documents are
            counted; must be protected from shell in usual way (eg. quoted).
  [options] are from among:
  -h(elp)   show (this) help screen; -? also works.
  -s        strict HTML mode, only requests ending in .htm(l) are counted,
            form GET/POST's, requests for graphics, etc., are ignored.
  -flogFile specify log file name (inc. path); default is to open HTTP.log
            in /powerweb/logs dir on the current drive.
  -enNET_IP exclude NET_IP range (eg. 111.222.333) from all statistics.
  -edDOMAIN exclude DOMAIN name (eg. mydomain.com) from all statistics;
            set both -e options to your own net/domain to exclude all of your
            own testing/configuration requests to server from statistics.
  -n        resolve and print client names instead of IP addresses; may take
            really LONG time for large log files with many requests.
  -l        print server load statistics: average daily and per month.
  -d        print document requests statistics: document hits per client.
  -c        print client statistics: total hits per each client.
  -t        print only totals/subtotals, less detailed output.
EOT
  print $helpText;
}
