################################################################
#
# Read options from file
#
################################################################

# These stay package variables (not lexicals): ScanSite reads them, and code
# outside this section may read or override them as well.
our ($options, $RootDir, $HtmlExtensions, $GraphicsExtensions,
     $OutputPrefix, $BaseURL, $CalculateImageSizes);

#Read options file
$options = GetFile("Options.txt");

#Extract options (each is undef when its "label = value" line is missing)
$RootDir             = _ExtractOption($options, 'website');
$HtmlExtensions      = _ExtractOption($options, 'HTML files');
$GraphicsExtensions  = _ExtractOption($options, 'Graphics files');
$OutputPrefix        = _ExtractOption($options, 'output prefix');
$BaseURL             = _ExtractOption($options, 'URL');
$CalculateImageSizes = _ExtractOption($options, 'Show image sizes');

################################################################
#
# ScanSite
#
# Walks the folder tree below $RootDir (breadth first) and writes
# eight report files whose names start with $OutputPrefix:
#   Folders.txt, Files.txt, Summary.htm, Categories.txt, Links.txt,
#   UntitledFiles.txt, Statistics.txt, Images.htm
#
# Takes no arguments and returns nothing useful.  Dies if no
# website folder is configured, if a report file cannot be created
# or closed, or if an HTML file cannot be read.  A folder that
# cannot be opened is reported with warn() and skipped.
#
# Leaves the totals in $CountFiles, $CountFolders, $CountHTMLFiles
# and $CountGraphicsFiles (package variables).
#
################################################################
sub ScanSite {
    # Strictures are enabled per sub so that any code outside this section
    # keeps compiling exactly as before.
    use strict;
    use warnings;

    our ($CountFiles, $CountFolders, $CountHTMLFiles, $CountGraphicsFiles);

    die "ScanSite: no 'website' folder is configured (see Options.txt)"
        unless defined $RootDir && length $RootDir;

    my $prefix   = defined $OutputPrefix ? $OutputPrefix : '';
    my $base_url = defined $BaseURL      ? $BaseURL      : '';
    my $root_len = length $RootDir;

    # "no", "false", "off", "0" and an empty value all switch the (slow)
    # size lookup off; any other value switches it on.
    my $show_sizes = defined $CalculateImageSizes
        && $CalculateImageSizes !~ /^\s*(?:0|no|false|off)?\s*$/i;

    my $graphics_re = _ExtensionMatcher($GraphicsExtensions);
    my $html_re     = _ExtensionMatcher($HtmlExtensions);

    # The original always joined paths with a backslash; keep that on the
    # DOS-family platforms and use a forward slash everywhere else.
    my $separator = $^O =~ /^(?:MSWin32|cygwin|dos|os2)$/ ? '\\' : '/';

    #Display folder name
    print "\nChecking website: $RootDir";

    ############################################################
    # Open files
    ############################################################
    my $folders_fh    = _OpenReport("${prefix}Folders.txt");
    my $files_fh      = _OpenReport("${prefix}Files.txt");
    my $summary_fh    = _OpenReport("${prefix}Summary.htm");
    my $categories_fh = _OpenReport("${prefix}Categories.txt");
    my $links_fh      = _OpenReport("${prefix}Links.txt");
    my $untitled_fh   = _OpenReport("${prefix}UntitledFiles.txt");
    my $stats_fh      = _OpenReport("${prefix}Statistics.txt");
    my $images_fh     = _OpenReport("${prefix}Images.htm");

    print {$summary_fh}    HTML_Header("Summary");
    print {$categories_fh} "#Number, title, sub-folder of\n\n";
    print {$files_fh}      "#URL, category, title, keywords\n\n";
    print {$links_fh}      "#URL, Description\n\n";
    print {$untitled_fh}   "The following files do not have any title:\n";
    print {$images_fh}     HTML_Header("Images from the website");

    ############################################################
    # Initialise state (fresh on every call)
    ############################################################
    my @pending   = ([ $RootDir, -1 ]);   # queue of [folder path, parent category]
    my $category  = 0;                    # number of the folder being listed
    my @links;                            # [url, link text] in first-seen order
    my %seen_link;                        # url => 1 once queued in @links
    my @untitled;                         # HTML files with no title or h1

    $CountFiles         = 0;
    $CountFolders       = 0;
    $CountHTMLFiles     = 0;
    $CountGraphicsFiles = 0;

    ############################################################
    # Scan folders
    ############################################################
    while (my $entry = shift @pending) {
        my ($folder, $parent) = @$entry;
        $CountFolders++;

        #Get relative folder name (empty for the root folder itself)
        my $relative_folder
            = length($folder) > $root_len ? substr($folder, $root_len + 1) : '';
        my $folder_name
            = $folder =~ m{[\\/](\w+)$} ? $1 : $relative_folder;

        next if $folder_name eq '_notes';

        $category++;

        #Write data to files
        # NOTE(review): the heading markup is a reconstruction -- confirm it
        # matches the intended layout of Summary.htm.
        print {$folders_fh}    "\n$relative_folder";
        print {$summary_fh}    "\n<h2>$relative_folder</h2>";
        print {$categories_fh} "\n" . $category . ", $folder_name, $parent";

        #Read directory
        my $dir;
        unless (opendir $dir, $folder) {
            warn "Could not read folder $folder: $!\n";
            next;
        }

        # defined(): a plain truth test would end the loop at a file named "0".
        while (defined(my $filename = readdir $dir)) {
            next if $filename eq '.' || $filename eq '..';

            #Get filenames
            my $true_filename     = $folder . $separator . $filename;
            my $relative_filename = substr($true_filename, $root_len + 1);
            $relative_filename =~ tr{\\}{/};

            # Ask the file system instead of guessing from a "." in the name.
            if (-d $true_filename) {
                push @pending, [ $true_filename, $category ];
                next;
            }

            $CountFiles++;

            #If a graphics file
            if ($filename =~ $graphics_re) {
                $CountGraphicsFiles++;

                #Calculate size if necessary (slow)
                my $size_text = '';
                if ($show_sizes) {
                    my $bytes = (-s $true_filename) || 0;   # -s is false for empty files
                    $size_text = " (size: $bytes bytes)";
                }

                #List in graphics
                # NOTE(review): the image markup is a reconstruction --
                # confirm it matches the intended layout of Images.htm.
                print {$images_fh}
                    "\n\n<hr>\n<p>\n\t<img src=\"$true_filename\">\n\t<br>"
                  . "\n\t$true_filename$size_text\n</p>";
            }

            #If an HTML file
            if ($filename =~ $html_re) {
                $CountHTMLFiles++;

                #Get HTML file
                my $html = GetFile($true_filename);

                #Extract title, falling back to the first <h1>
                my $title;
                if ($html =~ m{<title\b[^>]*>(.*?)</title>}is) {
                    $title = $1;
                }
                elsif ($html =~ m{<h1\b[^>]*>(.*?)</h1>}is) {
                    $title = $1;
                }
                else {
                    $title = "untitled ";
                    push @untitled, $true_filename;
                }
                $title =~ s/\n|,/ /g;    # keep the comma-separated record on one line

                #Extract keywords, falling back to the words of the title
                my $keywords;
                if ($html =~ /<meta\s+name=\"keywords\"\s+content\s*=\s*\"(.*?)\"\s*>/is) {
                    $keywords = $1;
                }
                else {
                    $keywords = join(", ", split(/ /, $title));
                }
                $keywords =~ tr/\n,/ -/;

                #Look for external links (each URL is listed once)
                while ($html =~ m{<a\s+[^>]*?href="(https?://[^"]*)"[^>]*>(.*?)</a>}gis) {
                    my ($url, $text) = ($1, $2);
                    next if $seen_link{$url}++;
                    push @links, [ $url, $text ];
                }

                #Write data to file
                print {$files_fh}
                    "\n${base_url}${relative_filename}, $category, $title, $keywords";
                print {$summary_fh}
                    "\n\t<p><a href=\"$true_filename\">$relative_filename</a></p>";
            }
        }
        closedir $dir;
    }

    ############################################################
    # Write other useful data to files
    ############################################################
    for my $link (@links) {
        my ($url, $description) = @$link;
        $description =~ s/<img.*?>/(image)/gis;
        $description =~ s/<.*?>//gs;
        $description =~ tr/\n,/ -/;
        print {$links_fh} "\n$url, $description";
    }

    if (@untitled) {
        print {$untitled_fh} "(" . scalar(@untitled) . " files)\n";
        print {$untitled_fh} "\n" . $_ for @untitled;
    }

    print {$summary_fh} HTML_Footer();
    print {$images_fh}  HTML_Footer();

    print {$stats_fh} "\nNumber of files: $CountFiles";
    print {$stats_fh} "\nNumber of folders: $CountFolders";
    print {$stats_fh} "\nNumber of HTML files: $CountHTMLFiles";
    print {$stats_fh} "\nNumber of graphics files: $CountGraphicsFiles";

    ############################################################
    # Close files (buffered write errors only surface here)
    ############################################################
    for my $fh ($folders_fh, $files_fh, $summary_fh, $categories_fh,
                $links_fh, $untitled_fh, $stats_fh, $images_fh) {
        close $fh or die "Could not finish writing a report file: $!";
    }

    return;
}

# GetFile(PATH)
# Returns the whole content of PATH as one string ('' for an empty file).
# Dies with "Could not open PATH: reason" when the file cannot be opened.
sub GetFile {
    use strict;
    use warnings;

    my ($path) = @_;
    open(my $fh, '<', $path) or die "Could not open $path: $!";
    local $/;    # slurp mode: read everything in one go
    my $content = <$fh>;
    close $fh;
    return defined $content ? $content : '';
}

# HTML_Header(TITLE)
# Returns the opening markup of a report page whose <title> is TITLE.
# NOTE(review): everything after TITLE is a reconstruction -- confirm the
# intended closing tags.
sub HTML_Header {
    my ($title) = @_;
    return "<html\n><head>\n\t<title>" . $title . "</title>\n</head>\n<body>\n";
}

# HTML_Footer()
# Returns the markup that closes a page started with HTML_Header.
# NOTE(review): the tags are a reconstruction -- confirm against HTML_Header.
sub HTML_Footer {
    return "\n\n</body>\n</html>\n";
}

# _ExtractOption(TEXT, LABEL)
# Returns the value of the first "LABEL = value" line in TEXT (label matched
# case-insensitively, surrounding blanks trimmed), or undef if there is none.
sub _ExtractOption {
    my ($text, $label) = @_;
    # [ \t]* rather than \s*: an empty value must not swallow the next line.
    my ($value) = $text =~ /^\Q$label\E[ \t]*=[ \t]*(.*?)[ \t\r]*$/mi;
    return $value;
}

# _ExtensionMatcher(ALTERNATIVES)
# Returns a compiled, case-insensitive pattern matching file names that end
# in "." followed by ALTERNATIVES (a regex fragment such as "gif|jpg").
# An undefined or empty fragment yields a pattern that never matches.
sub _ExtensionMatcher {
    my ($alternatives) = @_;
    return qr/(?!)/ unless defined $alternatives && length $alternatives;
    return qr/\.(?:$alternatives)$/i;
}

# _OpenReport(PATH)
# Creates (or truncates) PATH for writing and returns the file handle.
# Dies with the path and the system error when that fails.
sub _OpenReport {
    my ($path) = @_;
    open(my $fh, '>', $path) or die "Could not create $path: $!";
    return $fh;
}