#!/local/bin/perl5
# Copyright (c) 1998 University of Southern California.
# All rights reserved.
#
# Redistribution and use in source and binary forms are permitted
# provided that the above copyright notice and this paragraph are
# duplicated in all such forms and that any documentation, advertising
# materials, and other materials related to such distribution and use
# acknowledge that the software was developed by the University of
# Southern California, Information Sciences Institute.  The name of the
# University may not be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# Called with three arguments:
#
#     <sorted_log> <filtered_log> <dropped_log>
#
# Where:
#
#     <sorted_log>   -- An intermediate log file that is sorted by URL.
#
#     <filtered_log> -- A filtered intermediate log file.
#
#     <dropped_log>  -- Intermediate log file lines dropped by filtering.
#
# The program processes a daily SQUID proxy log file that has been sorted
# on the URL string field.  It filters that log, removing records for
# which the client and URL fields are not unique.  The <filtered_log>
# mimics what the proxy log file should contain if every client had a
# perfect, operating web cache that was flushed at the end of the day.
# The <dropped_log> contains those log lines that were filtered out.
# The number of records in the <filtered_log> plus the number in the
# <dropped_log> should equal the number in the input <sorted_log>.
# Main script: read a URL-sorted log and keep, for every (client, URL)
# pair, only the record with the smallest value in field 0 (the oldest
# reference).  Every other well-formed record goes to the drop file.
#
# Whitespace-separated fields used (zero based):
#     0 -- compared numerically; the smallest value is kept
#     1 -- client
#     4 -- URL
#
# The input must already be sorted by URL: records are grouped only while
# consecutive lines carry the same URL.

use strict;
use warnings;

# Write the retained record of every client in the current URL group to
# the given handle.  Clients are emitted in sorted order so that repeated
# runs on the same input produce byte-identical output.
sub flush_group {
    my ($out_fh, $oldest_for) = @_;
    for my $client (sort keys %{$oldest_for}) {
        print {$out_fh} "@{ $oldest_for->{$client} }\n";
    }
    return;
}

# Usage check.  The messages and the exit status of 0 are kept exactly as
# they were, in case existing callers depend on them.
if (@ARGV < 3 || !-r $ARGV[0]) {
    print "\a\nRequires three arguments.\n";
    print "\t1st argument: sorted Squid proxy log file.\n";
    print "\t2nd argument: filtered Squid proxy log file.\n";
    print "\t3rd argument: file of Squid lines dropped by filtering.\n";
    exit(0);
}

my ($sorted_path, $filtered_path, $dropped_path) = @ARGV;

# Three-argument open with lexical handles: the file names are never
# interpreted as modes or pipes, and a failed open stops the run instead
# of silently discarding every record written to that handle.
open(my $in_fh, '<', $sorted_path)
    or die "Cannot open '$sorted_path' for reading: $!\n";
open(my $keep_fh, '>', $filtered_path)
    or die "Cannot open '$filtered_path' for writing: $!\n";
open(my $drop_fh, '>', $dropped_path)
    or die "Cannot open '$dropped_path' for writing: $!\n";

my $loglines    = 0;     # well-formed input lines seen
my $filterlines = 0;     # lines that end up in the filtered file
my $droplines   = 0;     # lines that end up in the drop file
my %oldest_for  = ();    # client => fields of its oldest record for $url
my $url         = "";    # URL of the group currently being collected

LINE: while (my $line = <$in_fh>) {
    # Split on runs of whitespace into a named array; the trailing
    # newline is whitespace and therefore never becomes part of a field.
    my @fields = split ' ', $line;

    # Skip past bad lines.  Good lines have at least five fields.
    next LINE if @fields < 5;
    $loglines++;

    my $client   = $fields[1];
    my $line_url = $fields[4];

    # A new URL closes the previous group: write out its retained
    # records and start collecting clients afresh.
    if ($url ne $line_url) {
        flush_group($keep_fh, \%oldest_for);
        %oldest_for = ();
        $url        = $line_url;
    }

    if (!defined $oldest_for{$client}) {
        # First reference by this client to this URL.
        $filterlines++;
        $oldest_for{$client} = [@fields];
    }
    else {
        # Repeated reference: exactly one of the two records is dropped,
        # so $filterlines + $droplines always equals $loglines.
        $droplines++;
        if ($oldest_for{$client}->[0] > $fields[0]) {
            # The stored record is newer; drop it and keep this one.
            print {$drop_fh} "@{ $oldest_for{$client} }\n";
            $oldest_for{$client} = [@fields];
        }
        else {
            # On a tie the record seen first is kept.
            print {$drop_fh} "@fields\n";
        }
    }
}

# Finished reading the input log file.  Write out the group of clients
# that referenced the last URL.
flush_group($keep_fh, \%oldest_for);

close($in_fh);
# Buffered write errors only surface at close, so check the output handles.
close($keep_fh) or die "Error closing '$filtered_path': $!\n";
close($drop_fh) or die "Error closing '$dropped_path': $!\n";

if (!$loglines) {
    print "\n\aIn ccachefilter.pl, no good log lines seen.\n\n";
    exit(1);
}

print "\nFiltered log lines: $filterlines\n";
print "Dropped log lines: $droplines\n";
printf "Ratio dropped to total log lines: %0.3f\n", $droplines / $loglines;

exit(0);