#!/local/bin/perl5
# Copyright (c) 1998 University of Southern California.
# All rights reserved.
#
# Redistribution and use in source and binary forms are permitted
# provided that the above copyright notice and this paragraph are
# duplicated in all such forms and that any documentation, advertising
# materials, and other materials related to such distribution and use
# acknowledge that the software was developed by the University of
# Southern California, Information Sciences Institute.  The name of the
# University may not be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# Called with three arguments:
#
#     hmat.pl <logfile> <hmatfile> <summaryfile>
#
# Where:
#   <logfile> is a file that consists of a SQUID proxy
#   log file, which is sorted by URL name.
#
#   <hmatfile> gets the output from analysing each URL hit
#   creating a hit-matrix line for that URL.  The format of
#   a hit-matrix line is:
#
#       <URL> <initial-time> <size> <interval-hits> ... where
#
#       <initial-time> is the UNIX time in seconds since 1/1/1970 GMT
#       <interval-hits> consists of the string "interval.hits" where
#           interval represents the number of 15-minute
#           intervals since <initial-time> and hits is the number
#           of hits during that interval.
#
#   <summaryfile> is a one-line file that keeps track of
#       <days> <bytes> <hits> where
#
#       <days> assumes that hmat.pl is being run
#           across a sequence of daily log files,
#       <bytes> is the sum of all the byte counts of files
#           retrieved and <hits> is a count of all the
#           files retrieved.
use strict;
use warnings;

# Length of one hit-matrix interval: 15 minutes, expressed in seconds.
my $interval = 15 * 60;

# Comparison routine for sort(): orders its operands numerically, ascending.
sub numeric { $a <=> $b; }

# Write one hit-matrix line for a single URL.
#
# Arguments:
#   $out        - output filehandle for the hit-matrix file
#   $url        - the URL the timestamps belong to
#   $size       - largest byte count recorded for that URL
#   @timestamps - timestamps of every counted hit for that URL
#
# The line has the form "URL initial-time size i.h i.h ...\n", where each
# "i.h" pair is an interval number (counted in $interval-sized steps from
# the earliest timestamp) and the number of hits that fell into it.
# Intervals with no hits are omitted.  Nothing is written when there are
# no timestamps.
sub write_hit_matrix_line {
    my ($out, $url, $size, @timestamps) = @_;

    return unless @timestamps;

    my @sorted      = sort numeric @timestamps;
    my $initialdate = $sorted[0];

    my @hits;
    foreach my $elem (@sorted) {
        # The whole quotient must be truncated.  Writing
        # "int (...) / $interval" would apply int() to the difference
        # only and leave a fractional array subscript.
        my $index = int(($elem - $initialdate) / $interval);
        $hits[$index]++;
    }

    print {$out} "$url $initialdate $size ";
    for my $index (0 .. $#hits) {
        next unless defined $hits[$index];
        print {$out} $index, ".", $hits[$index], " ";
    }
    print {$out} "\n";

    return;
}

# All three arguments are required and the log file must be readable.
# A usage error is reported on STDERR with a non-zero exit status so that
# calling scripts can detect it.
if (   !defined $ARGV[0]
    || !(-r $ARGV[0])
    || !defined $ARGV[1]
    || !defined $ARGV[2])
{
    print STDERR "\a\nRequires three arguments.\n";
    print STDERR "\t1st argument: a sorted SQUID proxy log file.\n";
    print STDERR "\t2nd argument: an output hit-matrix file.\n";
    print STDERR "\t3rd argument: an output summary file.\n";
    exit(1);
}

my ($logfile, $hmatfile, $summaryfile) = @ARGV;

open(my $in, '<', $logfile)
    or die "Cannot open log file '$logfile' for reading: $!\n";
open(my $out, '>', $hmatfile)
    or die "Cannot open hit-matrix file '$hmatfile' for writing: $!\n";

#
# State accumulated while scanning the log.  The log is expected to be
# sorted by URL, so all lines for one URL are adjacent: @timestamps and
# $size describe the URL currently being collected, and are flushed to
# the hit-matrix file whenever the URL changes.
#
my $bytetotal  = 0;     # sum of the byte counts of all counted hits
my $hittotal   = 0;     # number of counted hits
my @timestamps = ();    # timestamps of the counted hits for $url
my $url        = "";    # URL currently being collected
my $size       = 0;     # largest byte count seen so far for $url

LINE: while (my $line = <$in>) {
    # Split on whitespace, ignoring leading whitespace.  The fields used
    # are: 0 = timestamp, 2 = log tag ("TAG/status"), 3 = byte count,
    # 4 = URL.
    my @fields = split ' ', $line;

    # Blank or truncated lines carry no URL; skip them instead of
    # counting them as a hit for an undefined URL.
    next LINE if @fields < 5;

    my ($stamp, $tag, $bytes, $name) = @fields[0, 2, 3, 4];

    # If this line contains a new URL, write out the current record
    # and set up for the new URL.
    if ($name ne $url) {
        write_hit_matrix_line($out, $url, $size, @timestamps);
        @timestamps = ();
        $url        = $name;
        $size       = 0;
    }

    # Filter out those log lines that are not satisfiable.
    my ($logtag) = split m{/}, $tag;
    next LINE if defined $logtag && $logtag eq "TCP_DENIED";

    # Accumulate total byte and hit counts.  Keep track of all
    # timestamps for this URL.
    $hittotal++;
    $bytetotal += $bytes;
    $size = $bytes if $bytes > $size;
    push @timestamps, $stamp;

    # Progress indicator: one line on STDOUT per 1000 counted hits.
    print "$hittotal\n" unless $hittotal % 1000;
}

# Write out the final URL's hit-matrix entry.
write_hit_matrix_line($out, $url, $size, @timestamps);

close($in);
close($out)
    or die "Error closing hit-matrix file '$hmatfile': $!\n";

#
# Update the running summary: "days bytes hits" on a single line.  An
# absent, unreadable or empty summary file counts as all zeros.
#
my %summary = (days => 0, bytes => 0, hits => 0);

if (-r $summaryfile) {
    open(my $summary_in, '<', $summaryfile)
        or die "Cannot open summary file '$summaryfile' for reading: $!\n";
    my $previous = <$summary_in>;
    close($summary_in);

    if (defined $previous) {
        my @totals = split ' ', $previous;
        $summary{"days"}  = $totals[0] if defined $totals[0];
        $summary{"bytes"} = $totals[1] if defined $totals[1];
        $summary{"hits"}  = $totals[2] if defined $totals[2];
    }
}

$summary{"days"}++;
$summary{"bytes"} += $bytetotal;
$summary{"hits"}  += $hittotal;

open(my $summary_out, '>', $summaryfile)
    or die "Cannot open summary file '$summaryfile' for writing: $!\n";
print {$summary_out} $summary{"days"}, " ", $summary{"bytes"}, " ",
    $summary{"hits"}, "\n";
close($summary_out)
    or die "Error closing summary file '$summaryfile': $!\n";

print "\nLog files summarized: ", $summary{"days"}, "\n";
print "This log file's total #bytes: $bytetotal\n";
print "This log file's total #hits: $hittotal\n";

exit(0);