#!/usr/local/bin/perl
#
# report.missing.pages.pl v1.0 Feb 23 1999 Sinclair Budd ( s.budd@ic.ac.uk)
#
# Produces a report of the missing pages and mails it to the webmaster
# of the server whose page has a link to the missing page.
#
# Usage: report.missing.pages.pl logfile reportfile webmastersfile
#
# (IN)  logfile         full path to a file to which the STDOUT of an htdig
#                       run has been directed,
#                       e.g. from a command like   htdig > /tmp/log
#
# (OUT) reportfile      full path to a file to which the program appends the
#                       information about missing pages for which no
#                       webmaster is known.
#
# (IN)  webmastersfile  full path to a file which contains the webmasters'
#                       email addresses, one for each of the servers.
#                       The webmasters appear one per line; the first entry
#                       is the IP name of the web server and the second
#                       entry, separated by a space, is the email address of
#                       that server's webmaster.
#                       e.g.  www.test.com fred.bloggs@test.com
#
# (OUT) STDOUT          a list of all missing web pages and the pages which
#                       reference them.
#
# -------------------------------------------------------------------------

use strict;
use warnings;

# Run the report only when this file is executed as a script; when it is
# loaded with require/do (e.g. by a test) only the subroutines are defined.
exit main(@ARGV) unless caller;

# main(logfile, reportfile, webmastersfile)
#
# Entry point: reads the webmaster list, echoes it to STDOUT, collects the
# "Not found" lines from the htdig log and writes one report section per
# referring server.  Returns 0 on success; dies if an input file cannot be
# opened or an output cannot be started.
sub main {
    my ($log_file, $report_file, $masters_file) = @_;

    die "Usage: $0 logfile reportfile webmastersfile\n"
        unless defined $masters_file;

    my $master_for = _read_webmasters($masters_file);

    print "Webmaster List \n";
    for my $server (sort keys %{$master_for}) {
        print $server, " ", $master_for->{$server}, "\n";
    }

    my @missing = _read_missing_pages($log_file);
    _write_reports(\@missing, $master_for, $report_file);

    return 0;
}

# Reads the webmasters file and returns a hash reference mapping each server
# name to its webmaster's email address.  Lines that do not hold both a
# server and an address (e.g. blank lines) are ignored.
sub _read_webmasters {
    my ($path) = @_;

    open(my $in, '<', $path)
        or die " Can't open webmasters file $path: $! ";

    my %master_for;
    while (my $line = <$in>) {
        chomp $line;
        my ($server, $address) = split " ", $line;
        next unless defined $address;
        $master_for{$server} = $address;
    }
    close $in;

    return \%master_for;
}

# Reads the htdig log and returns a list of [server, from_page, page]
# records, one per "Not found" line, sorted by server, then referring page,
# then missing page.  The line is split on whitespace: field 3 is taken as
# the missing page and field 5 as the page that references it.
sub _read_missing_pages {
    my ($path) = @_;

    open(my $log, '<', $path)
        or die " Can't open the htdig log file $path: $! ";

    my @records;
    while (my $line = <$log>) {
        next if $line !~ /^Not found/;
        chomp $line;

        my ($page, $from_page) = (split " ", $line)[2, 4];
        next unless defined $from_page;    # malformed line: no referrer

        # "http://server/path" split on "/" gives ("http:", "", "server", ...)
        my $server = (split m{/}, $from_page)[2];
        $server = '' unless defined $server;

        # push (rather than a pre-incremented index) so that no undefined
        # slot 0 is left behind to become an empty report section.
        push @records, [ $server, $from_page, $page ];
    }
    close $log;

    my @sorted = sort {
               $a->[0] cmp $b->[0]
            || $a->[1] cmp $b->[1]
            || $a->[2] cmp $b->[2]
    } @records;

    return @sorted;
}

# Walks the sorted records and writes one section per server, both to
# STDOUT and to that server's destination (mail to the webmaster, or the
# report file when no webmaster is known).  Each missing page is listed
# exactly once.
sub _write_reports {
    my ($records, $master_for, $report_file) = @_;

    my $out;
    my $current_server;

    for my $record (@{$records}) {
        my ($server, $from_page, $page) = @{$record};

        if (!defined $current_server || $server ne $current_server) {
            # Finish the previous server's output before starting the next,
            # so that each mail is sent as soon as its section is complete.
            _close_output($out) if $out;

            my $address = $master_for->{$server};
            $address = '' unless defined $address;

            $out = _open_output($address, $report_file);

            _emit($out, "Server ", $server, " Webmaster ", $address, " \n");
            _emit($out, " The first line of the pair is the URL of the page which references the missing page \n");
            _emit($out, " The second line is the URL of the missing page \n\n");

            $current_server = $server;
        }

        _emit($out, " ", $from_page, " Cant find \n ", $page, "\n\n");
    }

    _close_output($out) if $out;

    return;
}

# Opens and returns the output handle for one server's section: the report
# file in append mode when the address is empty, otherwise a pipe to Mail.
# The pipe uses list-form open so the address is passed as a single argument
# and is never interpreted by a shell.
sub _open_output {
    my ($address, $report_file) = @_;

    my $out;
    if ($address eq '') {
        open($out, '>>', $report_file)
            or die " Can't open the report file $report_file: $! ";
    }
    else {
        open($out, '|-', 'Mail', '-s', 'missing_web_pages', $address)
            or die " Can't pipe to mailer: $! ";
    }

    return $out;
}

# Closes an output handle, warning (not dying) on failure so that one bad
# mailer run does not prevent the remaining servers from being reported.
sub _close_output {
    my ($out) = @_;

    close $out
        or warn " Problem closing report output: ",
                ($! ? $! : "exit status $?"), "\n";

    return;
}

# Prints the given text to STDOUT and to the current report output.
sub _emit {
    my ($out, @text) = @_;

    print @text;
    print {$out} @text;

    return;
}

1;