######################################################################
#
# ScanGraphics.pl - the much-requested follow-up to ScanGraphics, the
# program for ripping off people's pictures from their web sites.
#
# You will need perl to run this program. Assuming you have windows,
# get perl from www.activestate.com. If you have a better o/s, you
# probably already have perl. The web site is www.perl.com
#
# Usage:
#   c:\perl\perl.exe   ScanGraphics.pl   http://www.netlife.lu/
#   (perl program)     (this script)     (URL of web site)
#
# Program to scan a web site, find all the graphics from it, and put
# links to them into an HTML page in the current directory named gfx.htm
#
# This program does not download the graphics, just lists them. Open the
# gfx.htm file in any web browser to view the graphics.
#
# This is not a polite program. It ignores any robots.txt files or other
# robot directives, and it loads pages as fast as your network connection
# will allow. This will seriously annoy many server administrators, so
# use this program infrequently and with caution. For large-scale projects,
# I suggest you put a delay (e.g. sleep(60)) in, and run a version of this
# program in a different process for each web site you're interested in,
# then leave it running overnight.
#
# Technical:
#   * Follows all 'downward' links to find more web pages
#     (i.e. not out-bound links, not ../ links, and not / links)
#
#   * Similar rules for finding photos - it won't index any pics from another
#     web site, or pics which are not in a lower-level directory than the page
#     being indexed. This is designed to stop it indexing (a) adverts
#     (b) bullets (c) any other crap on a site's /gfx directory.
#
#   * Flushes the output HTML file after every 10 pages indexed.
#
# Oliver White, August 2000, www.blibbleblobble.co.uk/Programming/Perl
#
##################################################################################

use strict;
use warnings;

#Internet library
use LWP::Simple;
#Core module that provides the flush() method on file handles
use IO::Handle;

#Options
my $OutputFilename = "gfx.htm";
my $FlushOutput    = 1;     #True: push buffered output to disk periodically
my $FlushInterval  = 10;    #Number of pages between flushes

#Get command line argument (the URL of the page to start from)
my $BaseFile = shift(@ARGV);
die("Usage: perl ScanGraphics.pl <URL of web site>\n")
    unless defined($BaseFile) && length($BaseFile);

#Queue of pages still to scan, plus the set of pages already queued.
#Without the set, two pages that link to each other would loop forever.
my @Files = ($BaseFile);
my %Seen  = ($BaseFile => 1);

open(my $GFX, '>', $OutputFilename)
    or die("Cannot open $OutputFilename for writing: $!\n");

my $Title = escape_html("Graphics from $BaseFile");

#HTML header
print {$GFX} "<html>\n<head>\n<title>$Title</title>\n</head>\n<body>\n"
           . "<h1>$Title</h1>\n";

#Get next page from queue until queue is empty
my $PagesDone = 0;
while (defined(my $File = shift(@Files))) {

    #Option to flush file, so a partial result survives an interrupted run
    if ($FlushOutput && ($PagesDone++ % $FlushInterval) == 0) {
        $GFX->flush();
    }

    #Update screen
    print "\n$File";

    #Directory of current page; relative links are resolved against it
    my $Directory = directory_of($File);

    #get() returns undef when the page cannot be fetched - skip such pages
    my $HTML = get($File);
    if (!defined($HTML)) {
        print " (could not be loaded)";
        next;
    }

    #Get all images from page
    while ($HTML =~ /<img\s[^>]*?src=\"(.*?)\"/gi) {
        my $Source = $1;    #Copy now: later pattern matches may clobber $1

        #Only use downward-links
        next unless is_downward_link($Source);

        my $Image = escape_html("$Directory/$Source");
        print {$GFX} "<p><img src=\"$Image\" alt=\"$Image\"></p>\n";
        print ".";
    }

    #Get all links from page
    while ($HTML =~ /<a\s[^>]*?href=\"(.*?)\"/gi) {
        my $Target = $1;

        #Only use downward-links, and ignore in-page anchors
        next unless is_downward_link($Target);
        next if $Target =~ /\#/;

        #Links straight to a picture are not pages worth scanning
        next if $Target =~ /\.(?:gif|jpg|jpeg|bmp|png)$/i;

        #Queue each page once only
        my $NewLink = "$Directory/$Target";
        next if $Seen{$NewLink}++;
        push(@Files, $NewLink);
    }
}
print "\n";

#HTML footer, and close file
print {$GFX} "<hr>\n"
           . "<p>Created by ScanGraphics.pl, from BlibbleBlobble.co.uk</p>\n"
           . "<p>If you have problems persuading IE to load images, try navigator</p>\n"
           . "</body>\n</html>\n";

#Buffered write errors only show up at close, so check it
close($GFX) or die("Cannot close $OutputFilename: $!\n");


#Returns the directory part of a URL (everything before the last slash).
#A bare "scheme://host" has no path part, so it is returned unchanged;
#otherwise "http://host" would be cut down to "http:/".
#Falls back to "." when the argument contains no slash at all.
sub directory_of {
    my ($Url) = @_;

    return $Url if $Url =~ m{\A\w+://[^\\/]+\z};
    return $1   if $Url =~ m{\A(.*)[\\/]}s;
    return ".";
}

#True if a link stays at or below the directory of the current page:
#not empty, no scheme or port (":"), no "..", and no leading slash.
sub is_downward_link {
    my ($Link) = @_;

    return 0 if $Link eq '';
    return 0 if $Link =~ /:/;
    return 0 if $Link =~ /\.\./;
    return 0 if $Link =~ /^[\\\/]/;
    return 1;
}

#Escapes the characters that are special in HTML text and attribute values,
#so scraped URLs cannot break the markup of the generated page.
sub escape_html {
    my ($Text) = @_;

    $Text =~ s/&/&amp;/g;
    $Text =~ s/</&lt;/g;
    $Text =~ s/>/&gt;/g;
    $Text =~ s/"/&quot;/g;
    return $Text;
}