#################################################################
# LogAnalyser
# Version 0.3 (beta 1)
# Copyright 2002, Oliver White
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
#################################################################
#It isn't particularly fast (it analyses 47000 lines in 10 seconds) but
#it does create a time-graph of hits for _every_ page, which can be useful!
#
#Beware, this program creates huge HTML files which may take a while for your
#browser to load. These HTML files may be difficult to use with the web.
#
# Reads the configured web-server access logs, accumulates hit statistics
# and writes them to a single HTML report ($OutputFilename).
#
# NOTE(review): the HTML tags inside this script's string literals were
# missing from the source under review, so the markup emitted below is a
# reconstruction. Bars are drawn with inline-styled <span> elements, which
# need no external image file; if the original release shipped bar images,
# confirm whether <img> output should be restored.
#
use strict;
use warnings;
use List::Util qw(max);

#Filenames of any logfiles to analyse
my @Logfiles = ('logs/access_log', 'logs/access_log.0', 'logs/access_log.1');

#Referrers containing this string count as internal links
my $BaseDomain = "www.blibbleblobble.co.uk";

my $OkLines     = 0;    # lines parsed successfully
my $FailedLines = 0;    # lines that did not match the expected format
my $Lines       = 0;    # every line read, parsed or not
my $Version     = "0.3 (beta 1)";
my $ProgramName = "LogAnalyser";

#The stats themselves
my %PageHitsByDate  = ();    # "YYYY-MM-DD" => page hits
my %PageHitsByMonth = ();    # "YYYY-MM"    => page hits
my %TotalHitsByPage = ();    # page name    => page hits
my %PageGraph       = ();    # page name => { "YYYY-MM-DD" => page hits }
my %Referrers       = ();    # referring host => requests
my %UserAgents      = ();    # simplified user agent => requests
my %GraphicHits     = ();    # graphic file => requests
my @IncomingLinksToGraphics;    # full referrer URLs from outside $BaseDomain
my %IPAddresses     = ();    # client IP => requests
my %Responses       = ();    # HTTP status code => requests
my %NotFoundPages   = ();    # requested file => number of 404 responses
my %SearchTerms     = ();    # normalised Google search phrase => requests

my %Months = ("Jan",1,"Feb",2,"Mar",3,"Apr",4,"May",5,"Jun",6,
              "Jul",7,"Aug",8,"Sep",9,"Oct",10,"Nov",11,"Dec",12);

my $OutputFilename = "Logfile.htm";
my $BarColour      = "#336699";    # fill colour of every graph bar

#Report state
my $HTML;                 # output filehandle, opened by DisplayResults
my $SectionCount = 0;     # used to give each section a unique anchor
my $Index = "\n\n\n<a name=\"index\"></a>\n<h2>Index</h2>\n<ul>\n";

# Run the whole analysis and print a short summary to STDOUT.
sub Main {
    my $StartTime = time();
    DoAnalysis();
    DisplayResults();
    my $TimeTaken = time() - $StartTime;
    print "\nDone in $TimeTaken seconds";
    print "\n$OkLines lines of the logfile were read and understood, $FailedLines lines were ignored";
    print "\n";
    return;
}

# Feed every line of every configured logfile to AnalyseLine.
# Dies if a logfile cannot be opened.
sub DoAnalysis {
    foreach my $Filename (@Logfiles) {
        open(my $Input, '<', $Filename)
            or die("\nCould not read file: $Filename: $!");

        # Read line by line so large logs are never held in memory at once
        while (my $Line = <$Input>) {
            AnalyseLine($Line);
        }
        close($Input);
    }
    return;
}

# Write the complete HTML report. Dies if the output file cannot be written.
sub DisplayResults {
    open($HTML, '>', $OutputFilename)
        or die("\nCould not write to $OutputFilename: $!");

    Emit("<html>\n<head>\n<title>Logfile stats</title>\n</head>\n<body>\n\n<h1>Logfile stats</h1>\n");

    Title("General stats");
    DisplayGeneralStats();

    Title("Monthly usage graph");
    DisplayPageHitsByMonth();

    Title("Daily usage graph");
    DisplayPageHitsByDay();

    Title("Responses");
    DisplayResponses();

    Title("Pages causing 404 errors");
    DisplayNotFoundPages(10);

    Title("Top referring domains");
    DisplayReferrers(30);

    Title("Search terms");
    DisplaySearchTerms(50);

    Title("User Agents");
    DisplayUserAgents(30);

    #Title("Page details");
    #DisplayHitsByPage();

    Title("Detail of each page");
    DisplayPageDetail();

    Title("Top graphics");
    DisplayGraphicHits(30);

    Title("Incoming links to graphics");
    DisplayIncomingLinksToGraphics();

    DisplayIndex();
    DisplayCredits();

    Emit("\n</body>\n</html>\n");

    # Buffered write errors only surface when the handle is closed
    close($HTML) or die("\nCould not write to $OutputFilename: $!");
    return;
}

# Print the given strings to the report.
sub Emit {
    print {$HTML} @_;
    return;
}

# Return $Text with the characters that are special in HTML escaped.
# Everything taken from a logfile passes through here before being printed.
sub EscapeHTML {
    my $Text = shift;
    $Text = '' unless defined $Text;
    $Text =~ s/&/&amp;/g;
    $Text =~ s/</&lt;/g;
    $Text =~ s/>/&gt;/g;
    $Text =~ s/"/&quot;/g;
    return $Text;
}

# Return one table row; each argument is the (already escaped) HTML of a cell.
sub Row {
    my @Cells = @_;
    return "\n<tr>" . join('', map { "<td>$_</td>" } @Cells) . "</tr>";
}

# Return the HTML for a filled rectangle of $Width x $Height pixels.
# $Label becomes the tooltip; $Colour defaults to $BarColour.
sub Bar {
    my ($Width, $Height, $Label, $Colour) = @_;
    $Colour = $BarColour unless defined $Colour;
    return "<span style=\"display:inline-block;width:${Width}px;"
         . "height:${Height}px;background:$Colour\" title=\""
         . EscapeHTML($Label) . "\"></span>";
}

# Start a new report section: anchor, heading, link back to the index,
# and a matching entry appended to the index itself.
sub Title {
    my $TitleText = EscapeHTML(shift);
    $SectionCount++;
    my $Anchor = "section$SectionCount";

    #Add an anchor
    Emit("\n<a name=\"$Anchor\"></a>");
    #Display the title
    Emit("\n\n<h2>$TitleText</h2>");
    #Make a link to the index
    Emit("\n<p><a href=\"#index\">Go to index</a></p>");
    #Add to the index
    $Index .= "\n<li><a href=\"#$Anchor\">$TitleText</a></li>";
    return;
}

# Print the index built up by the calls to Title.
sub DisplayIndex {
    Emit($Index . "\n</ul>\n");
    return;
}

# Table of HTTP response codes and how often each occurred.
sub DisplayResponses {
    StartTable();
    foreach my $Code (sort(keys(%Responses))) {
        Emit(Row(EscapeHTML($Code), $Responses{$Code}));
    }
    StopTable();
    return;
}

# Table of the most frequent search terms (at most $NumberToShow rows).
sub DisplaySearchTerms {
    my $NumberToShow = shift;
    DisplayArraySorted(\%SearchTerms, $NumberToShow);
    return;
}

# Print a (name => count) hash as a table sorted by descending count,
# with a bar scaled to the largest count.
#   $refArrayToDisplay - reference to the hash to display
#   $NumberToShow      - maximum number of rows; undef shows everything
sub DisplayArraySorted {
    my ($refArrayToDisplay, $NumberToShow) = @_;
    my $MaxHits = GetMax($refArrayToDisplay);
    my @Keys    = @{ SortByValue($refArrayToDisplay) };

    # Truncate up front so exactly $NumberToShow rows are printed
    if (defined $NumberToShow && $NumberToShow >= 0 && $NumberToShow < @Keys) {
        splice(@Keys, $NumberToShow);
    }

    StartTable();
    foreach my $Item (@Keys) {
        my $NumHits = $$refArrayToDisplay{$Item};
        Emit(Row(EscapeHTML($Item), $NumHits, DisplayBar($NumHits, $MaxHits, 500)));
    }
    StopTable();
    return;
}

# Table of the files that most often produced a 404 response.
sub DisplayNotFoundPages {
    my $NumberToShow = shift;
    DisplayArraySorted(\%NotFoundPages, $NumberToShow);
    return;
}

# Table of the most frequent referring hosts.
sub DisplayReferrers {
    my $NumberToShow = shift;
    DisplayArraySorted(\%Referrers, $NumberToShow);
    return;
}

# Table of the most frequent user agents, with each one's share of all
# logfile lines read.
sub DisplayUserAgents {
    my $NumberToShow = shift;
    my $Max  = GetMax(\%UserAgents);
    my @Keys = @{ SortByValue(\%UserAgents) };
    if (defined $NumberToShow && $NumberToShow >= 0 && $NumberToShow < @Keys) {
        splice(@Keys, $NumberToShow);
    }

    StartTable();
    foreach my $UserAgent (@Keys) {
        my $NumHits = $UserAgents{$UserAgent};
        my $Percent = $Lines ? int(100 * $NumHits / $Lines) : 0;
        Emit(Row(EscapeHTML($UserAgent), $NumHits, "$Percent\%",
                 DisplayBar($NumHits, $Max, 350)));
    }
    StopTable();
    return;
}

# Table of the most frequently requested graphics.
sub DisplayGraphicHits {
    my $NumberToShow = shift;
    DisplayArraySorted(\%GraphicHits, $NumberToShow);
    return;
}

# List every external referrer that pointed at one of our graphics.
sub DisplayIncomingLinksToGraphics {
    foreach my $Link (@IncomingLinksToGraphics) {
        my $Safe = EscapeHTML($Link);
        # Only turn genuine http(s) URLs into clickable links
        if ($Link =~ m{^https?://}i) {
            Emit("\n<p><a href=\"$Safe\">$Safe</a></p>");
        }
        else {
            Emit("\n<p>$Safe</p>");
        }
    }
    return;
}

# Return the HTML for a horizontal bar whose width is $Value / $MaxValue
# of $MaxWidth pixels.
sub DisplayBar {
    my ($Value, $MaxValue, $MaxWidth) = @_;
    $MaxValue = 1 if !$MaxValue || $MaxValue < 1;    # never divide by zero
    my $ImageWidth = int($MaxWidth * $Value / $MaxValue);
    return Bar($ImageWidth, 10, $Value);
}

# Line counts and the number of distinct client IP addresses.
sub DisplayGeneralStats {
    Emit("\n<p>$OkLines logfile lines analysed. $FailedLines lines not understood</p>");
    Emit("\n<p>Requests originated from " . scalar(keys(%IPAddresses))
       . " different IP addresses</p>");
    return;
}

# Footer: program name, version, creation time (GMT) and licence note.
sub DisplayCredits {
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime(time);
    # gmtime returns a 0-based month and a year counted from 1900
    my $Stamp = sprintf("%04d-%02d-%02d %02d:%02d",
                        $year + 1900, $mon + 1, $mday, $hour, $min);

    Emit("\n<hr>\n<p>Created by $ProgramName version $Version, on $Stamp</p>");
    # NOTE(review): the link target was lost with the markup; confirm the URL.
    Emit("\n<p>This program is free software, released under the GNU license. See "
       . "<a href=\"http://www.blibbleblobble.co.uk/\">blibbleblobble.co.uk</a> "
       . "for more details, or to download the program</p>\n<p>&nbsp;</p>\n");
    return;
}

# One vertical bar per day on which a page was hit, in date order.
sub DisplayPageHitsByDay {
    my $Max = GetMax(\%PageHitsByDate);
    my $MaxBarHeightInPixels = 200;

    Emit("\n<p>");
    foreach my $Date (sort(keys(%PageHitsByDate))) {
        my $HitsThatDay = $PageHitsByDate{$Date};
        my $ImageHeight = int($MaxBarHeightInPixels * $HitsThatDay / $Max);
        if ($ImageHeight < 2) { $ImageHeight = 2; }
        Emit("\n" . Bar(3, $ImageHeight, "$Date: $HitsThatDay"));
    }
    Emit("( max: $Max)</p>\n");
    return;
}

# Table of page hits per month with a horizontal bar for each month.
sub DisplayPageHitsByMonth {
    my $Max = GetMax(\%PageHitsByMonth);
    my $MaxBarWidthInPixels = 500;

    StartTable();
    foreach my $Month (sort(keys(%PageHitsByMonth))) {
        Emit(Row($Month, $PageHitsByMonth{$Month},
                 DisplayBar($PageHitsByMonth{$Month}, $Max, $MaxBarWidthInPixels)));
    }
    StopTable();
    return;
}

# Table of total hits per page (currently not called from DisplayResults).
sub DisplayHitsByPage {
    my $Max = GetMax(\%TotalHitsByPage);
    my $MaxBarWidthInPixels = 500;

    StartTable();
    foreach my $Page (sort(keys(%TotalHitsByPage))) {
        my $ImageWidth = int($MaxBarWidthInPixels * $TotalHitsByPage{$Page} / $Max);
        if ($ImageWidth < 2) { $ImageWidth = 2; }
        Emit(Row(EscapeHTML($Page), $TotalHitsByPage{$Page}, Bar($ImageWidth, 10, "")));
    }
    StopTable();
    return;
}

# One table row per page: name, total hits, a bar for the total and, for
# pages with more than 3 hits, a small day-by-day timeline.
sub DisplayPageDetail {
    my $RowHeightInPixels   = 40;
    my $MaxBarWidthInPixels = 200;
    my $BarWidth            = 2;

    my @Pages    = sort keys %PageGraph;
    my $MaxWidth = max(1, map { $TotalHitsByPage{$_} } @Pages);

    Emit("\n<table>");
    foreach my $Page (@Pages) {
        my $PageHitsTotal = $TotalHitsByPage{$Page};
        my $ImageWidth = int($MaxBarWidthInPixels * $PageHitsTotal / $MaxWidth);
        if ($ImageWidth < 2) { $ImageWidth = 2; }

        Emit("\n<tr><td>" . EscapeHTML($Page) . "</td><td>$PageHitsTotal</td><td>"
           . Bar($ImageWidth, 10, "") . "</td><td>");
        if ($PageHitsTotal > 3) {
            Emit(PageTimeline($PageGraph{$Page}, $RowHeightInPixels, $BarWidth));
        }
        #else: too few hits to draw graph, leave the cell empty
        Emit("</td></tr>");
    }
    Emit("\n</table>\n");
    return;
}

# Return the HTML timeline for one page.
#   $HitsByDate - reference to a hash of "YYYY-MM-DD" => hits
#   $RowHeight  - height in pixels of the tallest bar
#   $BarWidth   - width in pixels of one day
# The first and last dates are printed as labels; days without hits are
# left as a transparent gap so the horizontal axis is linear in time.
sub PageTimeline {
    my ($HitsByDate, $RowHeight, $BarWidth) = @_;
    my @Dates = sort keys %$HitsByDate;
    return '' unless @Dates;

    my $Max  = GetMax($HitsByDate);
    my $Html = "$Dates[0]&nbsp;&nbsp;";
    my $LastDay;
    foreach my $Date (@Dates) {
        my $Day = DayNumber(split(/-/, $Date));
        if (defined $LastDay) {
            my $Gap = $Day - $LastDay - 1;
            $Html .= Bar($Gap * $BarWidth, 1, "", "transparent") if $Gap > 0;
        }
        my $HitsThatDay = $HitsByDate->{$Date};
        my $BarHeight   = int($RowHeight * $HitsThatDay / $Max);
        $BarHeight = 1 if $BarHeight < 1;
        $Html .= Bar($BarWidth, $BarHeight, "$Date: $HitsThatDay");
        $LastDay = $Day;
    }
    #Label the date of the last hit to that page (i.e. the rightmost bar on the graph)
    $Html .= "&nbsp;&nbsp;$Dates[-1]";
    return $Html;
}

# Return the number of days between 1970-01-01 and the given Gregorian
# date. Pure integer arithmetic, so an impossible date from a damaged
# logfile cannot raise an error.
sub DayNumber {
    my ($Year, $Month, $Day) = @_;
    $Year -= 1 if $Month <= 2;    # count the year from 1 March
    my $Era       = int(($Year >= 0 ? $Year : $Year - 399) / 400);
    my $YearOfEra = $Year - $Era * 400;
    my $DayOfYear = int((153 * ($Month + ($Month > 2 ? -3 : 9)) + 2) / 5) + $Day - 1;
    my $DayOfEra  = $YearOfEra * 365 + int($YearOfEra / 4)
                  - int($YearOfEra / 100) + $DayOfYear;
    return $Era * 146097 + $DayOfEra - 719468;
}

#Get a hash of (ID => Number)'s and return the largest number
#(never less than 1, so the result is always safe to divide by)
sub GetMax {
    my $Array = shift;
    return max(1, values %$Array);
}

# Return a reference to the hash's keys sorted by descending value;
# keys with equal values are ordered alphabetically.
sub SortByValue {
    my $Array = shift;
    my @SortedKeys = sort { $$Array{$b} <=> $$Array{$a} or $a cmp $b }
                     keys(%$Array);
    return \@SortedKeys;
}

sub StartTable { Emit("\n<table>");    return; }
sub StopTable  { Emit("\n</table>\n"); return; }

# Parse one access-log line.
# Returns undef when the line is not understood, otherwise a reference to
# a hash with the keys IP, Date ("YYYY-MM-DD"), Month ("YYYY-MM"), File
# (request path without query string), Response, FullReferrer, Referrer
# (host part of the referrer only), UserAgent (simplified), IsPage and
# IsGraphic.
sub ParseLogLine {
    my $Line = shift;
    return unless defined $Line;
    # host, ident, user, [date], "request", status, rest
    return unless $Line =~ /(\d+\.\d+\.\d+\.\d+) \S+ \S+ \[(.*?)\] \"(.*?)\" (\d+)(.*)/;
    my ($IP, $Date, $Request, $Response, $ExtraDetail) = ($1, $2, $3, $4, $5);

    #Parse the request to get the filename; fall back to the raw request
    my $File = $Request;
    if ($Request =~ /(.*?) (.*) HTTP\/(.*)/) {
        $File = $2;
    }
    $File =~ s/\?.*//s;    # drop the query string

    #Classify by the extension of the last path component
    my ($IsPage, $IsGraphic) = (0, 0);
    if ($File =~ m{/[^/]*\.([^/.]*)$}) {
        my $Extension = $1;
        if ($Extension =~ /^(?:html?|php.?|txt)$/i) {
            $IsPage = 1;
        }
        elsif ($Extension =~ /^(?:bmp|png|jpe?g|gif|ico)$/i) {
            $IsGraphic = 1;
        }
    }

    #parse the second half of the line for browser and referring page
    my ($FullReferrer, $UserAgent) = ("-", "-");
    if ($ExtraDetail =~ /\"(.*?)\" \"(.*?)\"/) {
        ($FullReferrer, $UserAgent) = ($1, $2);
    }

    #Investigate the referrer (take only the servername)
    my $Referrer = $FullReferrer;
    if ($FullReferrer =~ m{^https?://([^/]+)}i) {
        $Referrer = lc($1);
    }

    #Investigate the user agent
    if ($UserAgent =~ /^Mozilla\/(\d+\.\d+)(.*)/) {
        my ($MozillaVersion, $RestOfString) = ($1, $2);
        if ($RestOfString =~ /(MSIE|Opera|Galeon|Konqueror|OmniWeb|ZyBorg|Ask Jeeves)/) {
            $UserAgent = $1;
        }
        else {
            $UserAgent = "Mozilla $MozillaVersion";
        }
    }

    #convert the date; zero-padded so that string order is date order
    return unless $Date =~ m{(\d{2})/(.{3})/(\d{4}):\d{2}:\d{2}:\d{2} [+-]\d{4}};
    my ($DayOfMonth, $MonthName, $Year) = ($1, $2, $3);
    return unless exists $Months{$MonthName};
    my $MonthFormatted = sprintf("%04d-%02d", $Year, $Months{$MonthName});
    my $DateFormatted  = sprintf("%s-%02d", $MonthFormatted, $DayOfMonth);

    return {
        IP           => $IP,
        Date         => $DateFormatted,
        Month        => $MonthFormatted,
        File         => $File,
        Response     => $Response,
        FullReferrer => $FullReferrer,
        Referrer     => $Referrer,
        UserAgent    => $UserAgent,
        IsPage       => $IsPage,
        IsGraphic    => $IsGraphic,
    };
}

# Return the normalised search phrase from a Google referrer URL, or undef
# when there is none (or when the request was for a cached copy).
sub ExtractSearchTerm {
    my $FullReferrer = shift;
    return unless defined $FullReferrer;
    return unless $FullReferrer =~ m{google.*?[?&]q=([^&#]*)}i;
    my $SearchTerm = $1;

    #Translate spaces
    $SearchTerm =~ s/\+/ /g;
    #Translate the quoted characters into ascii
    $SearchTerm =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/ge;
    #Lowercase everything for convenience of comparing them
    $SearchTerm = lc($SearchTerm);

    #Ignore anything asking for a cached copy. This must be tested before
    #the punctuation is removed, otherwise the colon is already gone.
    return if index($SearchTerm, "cache:") >= 0;

    #Remove non-alphabetic characters
    $SearchTerm =~ s/\W/ /g;
    #Collapse whitespace
    $SearchTerm =~ s/\s+/ /g;
    $SearchTerm =~ s/^ //;
    $SearchTerm =~ s/ $//;

    return if $SearchTerm eq "";
    return $SearchTerm;
}

#Analyse a line of the logfile and add it to the statistics
sub AnalyseLine {
    my $Line = shift;
    $Lines++;

    my $Hit = ParseLogLine($Line);
    if (!$Hit) {
        $FailedLines++;
        return;
    }

    my $File         = $Hit->{File};
    my $Referrer     = $Hit->{Referrer};
    my $FullReferrer = $Hit->{FullReferrer};

    if ($Hit->{IsPage}) {
        #Add data to lists
        $PageHitsByDate{ $Hit->{Date} }++;
        $PageHitsByMonth{ $Hit->{Month} }++;
        $TotalHitsByPage{$File}++;
        $PageGraph{$File}{ $Hit->{Date} }++;
    }
    elsif ($Hit->{IsGraphic}) {
        $GraphicHits{$File}++;

        #If referrer to a graphic is not in this domain
        #(string comparison: numeric != treats every host name as 0)
        if (   $Referrer ne ""
            && $Referrer ne "-"
            && index($FullReferrer, $BaseDomain) < 0
            && index($FullReferrer, "http") > -1)
        {
            push(@IncomingLinksToGraphics, $FullReferrer);
        }
    }

    $Referrers{$Referrer}++;
    $UserAgents{ $Hit->{UserAgent} }++;
    $IPAddresses{ $Hit->{IP} }++;
    $Responses{ $Hit->{Response} }++;

    #Look for a google-search in the referring page
    my $SearchTerm = ExtractSearchTerm($FullReferrer);
    $SearchTerms{$SearchTerm}++ if defined $SearchTerm;

    if ($Hit->{Response} == 404) {
        $NotFoundPages{$File}++;
    }

    $OkLines++;
    return;
}

# Only run when executed as a script, so the subs above can be loaded
# (e.g. by a test) without starting an analysis.
Main() unless caller;

1;