#!/local/bin/perl5
# Copyright (c) 1998 University of Southern California.
# All rights reserved.
#
# Redistribution and use in source and binary forms are permitted
# provided that the above copyright notice and this paragraph are
# duplicated in all such forms and that any documentation, advertising
# materials, and other materials related to such distribution and use
# acknowledge that the software was developed by the University of
# Southern California, Information Sciences Institute.  The name of the
# University may not be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# Accepts three arguments:
#
#   <sorted-hmat-file> <combined-hmat-file> <summary-file>
#
# Where:
#   <sorted-hmat-file> is a hit-matrix file that is sorted
#       by URL.  Usually, this is the result of concatenating
#       smaller individual daily hit-matrix files.
#
#   <combined-hmat-file> is the combined hit-matrix file that
#       contains one line per URL.  Each line may be quite long.
#
#       A hit-matrix file line consists of:
#
#           <URL> <timestamp> <size> <entry> ... where
#
#       <timestamp> is the UNIX time in seconds since 1/1/1970 GMT
#       <entry> consists of the string "interval.hits" where
#           interval represents the number of 15-minute
#           intervals since <timestamp> and hits is the number
#           of hits during that interval.
#
#   <summary-file> is a one-line file produced by hmat.pl that keeps track of
#       <days> <bytes> <hits> where
#
#       <days> assumes that hmat.pl is being run
#           across a sequence of daily log files,
#       <bytes> is the sum of all the byte counts of files
#           retrieved and <hits> is a count of all the
#           files retrieved.
#
#       To those three fields is appended a <urls> field
#       that keeps track of the number of unique URLs seen
#       in the combined hit-matrix file by cmbhmat.pl.
#
#       NOTE: The URLs ftp://foo.bar and http://foo.bar
#       will be counted as two URLs.  This does not count
#       unique underlying files, although the difference
#       should be insignificant.
use strict;
use warnings;

# Sort comparator: ascending numeric order.
sub numeric { $a <=> $b; }

# Length of one hit-matrix interval: 15 minutes, in seconds.
my $interval = 15 * 60;

# Print the usage message to STDOUT (text unchanged from the original script).
sub print_usage {
    print "\a\nRequires three arguments.\n";
    print "\t1st argument: a sorted hit-matrix file.\n";
    print "\t2nd argument: an output hit-matrix file.\n";
    print "\t3rd argument: an output summary file.\n";
    return;
}

# Merge every hit-matrix record of one URL into a single set of counters.
#
# $records is an array ref of records, each an array ref of the
# whitespace-separated fields of one input line:
#     [ url, timestamp, size, "interval.hits", ... ]
#
# Returns ($initialdate, \%hits_at) where $initialdate is the smallest
# timestamp among the records and %hits_at maps an interval index (counted
# from $initialdate) to the summed hit count.
sub combine_records {
    my ($records) = @_;

    # Find the oldest date; every interval index is re-based onto it.
    my @timestamps  = map { $_->[1] } @{$records};
    my @sorted      = sort numeric @timestamps;
    my $initialdate = $sorted[0];

    my %hits_at;
    for my $record (@{$records}) {
        my (undef, $timestamp, undef, @entries) = @{$record};

        # Offset of this record relative to the oldest one, in intervals.
        # Deliberately NOT truncated here: truncation is applied to the
        # final index below, which is what the script has always done.
        my $interval_diff = ($timestamp - $initialdate) / $interval;

        for my $entry (@entries) {
            my ($slot, $hits) = split /\./, $entry;
            my $index = int($slot + $interval_diff);
            $hits_at{$index} += (defined $hits ? $hits : 0);
        }
    }

    return ($initialdate, \%hits_at);
}

# Write one combined hit-matrix line for $url to the filehandle $out:
#     "<url> <initialdate> <size> <index>.<hits> <index>.<hits> ... \n"
# (each entry, including the last, is followed by a single space).
sub write_combined_line {
    my ($out, $url, $size, $records) = @_;

    my ($initialdate, $hits_at) = combine_records($records);

    print {$out} "$url $initialdate $size ";
    for my $index (sort numeric keys %{$hits_at}) {
        print {$out} $index . '.' . $hits_at->{$index} . ' ';
    }
    print {$out} "\n";
    return;
}

# Script entry point.
#
# Arguments: (input hit-matrix path, output hit-matrix path, summary path).
# Reads the URL-sorted input, writes one combined line per URL to the
# output, then rewrites the summary file as "days bytes hits urls" and
# prints the same figures to STDOUT.  Dies if a file cannot be opened or
# written.  Returns the process exit status (always 0; the usage path also
# returns 0, as the script always has).
sub main {
    my @args = @_;

    if (   !defined $args[0]
        || !-r $args[0]
        || !defined $args[1]
        || !defined $args[2])
    {
        print_usage();
        return 0;
    }
    my ($in_path, $out_path, $summary_path) = @args;

    open(my $in, '<', $in_path)
        or die "Cannot open $in_path for reading: $!\n";
    open(my $out, '>', $out_path)
        or die "Cannot open $out_path for writing: $!\n";

    my $url      = "";    # URL whose records are currently being collected
    my $size     = 0;     # largest size field seen for $url
    my @records  = ();    # collected records for $url
    my $urltotal = 0;     # number of combined lines written so far

    # Write out the records collected for the current URL, if any, and
    # report progress on STDOUT every 1000 URLs.
    my $flush = sub {
        return unless @records;    # Don't print if nothing there.
        write_combined_line($out, $url, $size, \@records);
        $urltotal++;
        print "$urltotal\n" unless $urltotal % 1000;
        return;
    };

    while (my $line = <$in>) {
        # split ' ' discards leading whitespace and splits into words.
        my @fields = split ' ', $line;

        # A blank line carries no record; treating it as one would end the
        # current URL early and emit a bogus line for an empty URL.
        next unless @fields;

        # If this line contains a new URL, write out the current combined
        # hit-matrix line and set up for the new URL.
        if ($fields[0] ne $url) {
            $flush->();
            @records = ();
            $url     = $fields[0];
            $size    = 0;
        }

        # Copy the hit-matrix line and keep the largest size seen.
        if (defined $fields[2] && $fields[2] > $size) {
            $size = $fields[2];
        }
        push @records, [@fields];
    }

    # Print out the final URL's hit-matrix entry.
    $flush->();

    close($in);
    close($out) or die "Cannot finish writing $out_path: $!\n";

    # Fields missing from the summary file are emitted as empty strings.
    my %summary = (days => '', bytes => '', hits => '');
    if (-r $summary_path) {
        open(my $summary_in, '<', $summary_path)
            or die "Cannot open $summary_path for reading: $!\n";
        my $first_line = <$summary_in>;
        close($summary_in);

        if (defined $first_line) {
            my @fields = split ' ', $first_line;
            my @names  = qw(days bytes hits);
            for my $i (0 .. $#names) {
                $summary{ $names[$i] } = $fields[$i] if defined $fields[$i];
            }
        }
    }

    # The URL count always reflects this run's combined file; any fourth
    # field already present in the summary file is not read back.
    $summary{urls} = $urltotal;

    open(my $summary_out, '>', $summary_path)
        or die "Cannot open $summary_path for writing: $!\n";
    print {$summary_out} $summary{days}, " ", $summary{bytes}, " ",
        $summary{hits}, " ", $summary{urls}, "\n";
    close($summary_out) or die "Cannot finish writing $summary_path: $!\n";

    print "\nHit-matrix files summarized: ", $summary{days}, "\n";
    print "Total #bytes: ", $summary{bytes}, "\n";
    print "Total #hits: ", $summary{hits}, "\n";
    print "Total #urls: ", $summary{urls}, "\n\n";

    return 0;
}

# Run only when executed as a script, so the subs above can also be loaded
# (e.g. via require) without side effects.
exit(main(@ARGV)) unless caller;

1;