Sunday, March 16, 2008

Perl Web Spider/Robot For Linux or Unix

Hey There,

Here's a fun lazy-Sunday post. This Perl script, along with the script we already showcased for doing massive URL downloading on Linux/Unix or with Windows Perl, is part of a bigger project we're working on, but it's fun on its own, too.

This script is a little better at showing how to use the Getopt::Std module, as well as the WWW::Robot and LWP::UserAgent Perl modules. It basically provides an easy way to download files of any specific type you name on the command line, or to just "spider" a site (by not naming any file extensions).
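If you haven't played with WWW::Robot before, the basic pattern is simple: create the robot, register a few hook callbacks and point run() at a starting URL. Here's a bare-bones sketch of that skeleton, which the full script below fleshes out (the hook bodies are just placeholders, and the NAME, EMAIL and URL values are made up):

#!/usr/bin/perl

# Bare-bones WWW::Robot skeleton - for illustration only.

use WWW::Robot;

my $robot = new WWW::Robot ( 'NAME' => 'example-bot',
'VERSION' => '0.1',
'EMAIL' => 'me@xyz.com' );

# Decide whether to follow each link (return 1 to follow, 0 to skip).

$robot->addHook('follow-url-test', sub { return 1; });

# Do something with each page the robot pulls down.

$robot->addHook('invoke-on-contents', sub {
my ($robot, $hook, $url, $response, $structure) = @_;
print "Fetched $url\n";
});

$robot->run('http://www.xyz.com/');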

You should note that we do include a section to comply with the Robot Exclusion Protocol. You can take that out if you don't want to play nice with sites that don't want to get spidered, but it would be courteous not to :)
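If you do rip that part out, but later want a quick way to honor robots.txt on your own terms, the WWW::RobotRules module (it ships with libwww-perl) will do the parsing for you. This is just an illustrative sketch, not part of the grabber script, and the URLs in it are made up:

use LWP::Simple qw(get);
use WWW::RobotRules;

my $rules = WWW::RobotRules->new('Mozilla/4.0');
my $robots_url = 'http://www.xyz.com/robots.txt';
my $robots_txt = get($robots_url);
$rules->parse($robots_url, $robots_txt) if defined $robots_txt;

# Only grab the file if robots.txt doesn't forbid it.

if ( $rules->allowed('http://www.xyz.com/pix/photo.jpg') ) {
print "Clear to download\n";
}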

You can call this Perl script simply, like so:

host # ./grabber http://www.xyz.com jpg gif

or

host # ./grabber http://www.xyz.com <-- If you just want to spider

or choose from the multitude of options that are listed near the top of the script in the comments section.
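For example, based on the defaults documented in those comments, a line like the following should keep the crawl leashed to the starting domain (-l), mirror the site's directory structure locally (-d) and uniquify duplicate filenames (-u):

host # ./grabber -l -d -u http://www.xyz.com jpg gif png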

Hope you enjoy this, and have some fun with it. Enjoy your Sunday :)

Cheers,

Mike


This work is licensed under a Creative Commons Attribution-Noncommercial-Share Alike 3.0 United States License.

#!/usr/bin/perl

# grabber - Grab pix or spider around.
# Usage: grabber [-u] [-l] [-d] [-s pattern] [-b] [-a] [-p proxy] [-e] [-h] base_url [file ext 1] ... [file ext n]
# Ex: grabber http://xyz.com/ jpg gif.
#
# 2008 - Mike Golvach - eggi@comcast.net
#
# Creative Commons Attribution-Noncommercial-Share Alike 3.0 United States License
#

# The Defaults: What you get if you settle.

$opt_u = 0; # No guaranteed uniqueness of downloads.
$opt_l = 0; # Search not bound to domain of base url.
$opt_d = 0; # No directories created for downloads to
# mimic site structure somewhat.
$opt_s = "NONE"; # No Search parameters set to maximize
# efficiency of search.
$opt_b = 0; # Don't try standard default pages if
# using a directory url fails.
$opt_a = 0; # Don't push cgi buttons.
$opt_p = "NONE"; # Direct Connection, no proxy.
$opt_e = 0; # No verbose messaging.
$opt_h = 0; # No help screen.

# Check for command line switches before proceeding.

@bargv = @ARGV;
use Getopt::Std;

$result = getopts('ulds:bap:ehvwxyz') || &usage;

if ( $#ARGV < 0 || ! $result ) {
&usage;
}

if ( $opt_h ) {
&usage;
exit;
}

use WWW::Robot;

# Flush the output for good measure.

$| = 1;
$okay = 1;
$farmfresh = 1;
$errorprone = 1;
$homebase = `pwd`;
chomp $homebase;

$uniquifier = 0 if $opt_u;

# Parse the remaining Arg vector

$base_url = shift @ARGV;
@file_types = @ARGV;

# For list of file types, build a little regular expression.

foreach $ft (@file_types) {
$pcregexp .= "$ft\|";
}
$pcregexp =~ s/\|$//;

# Set up seemingly valid attributes for the robot so it
# doesn't get bashed immediately. Slightly unethical, but
# we've included a valid email for complaints.

$hummer = new WWW::Robot ( 'NAME' => 'Mozilla/4.0',
'VERSION' => '1.001',
'EMAIL' => 'me@xyz.com',
'TRAVERSAL' => 'depth',
'VERBOSE' => '0');

die "Could not create the WWW::Robot object\n" if ( !defined $hummer );

# Robot creation successful. Define initial directory structure
# if necessary.

if ( $opt_l ) {
$leash = $base_url;
$leash =~ s/http:\/\/([^\/]+)\/.*/$1/;
}

if ( $opt_d ) {
$basic = $base_url;
$basic =~ s/(.*\/).*$/$1/;
}

# Describe hook functions of robot.

$hummer->addHook('follow-url-test', \&url_test);
$hummer->addHook('invoke-on-get-error', \&get_error);
$hummer->addHook('invoke-on-contents', \&contents);

# Set to only check html on pages and set no delay between GETs.

$hummer->setAttribute('IGNORE_TEXT', 0);
$hummer->setAttribute('REQUEST_DELAY', 0);

# proxy needed for maximum utility on any server.

if ( $opt_p ne "NONE" ) {
if ( $opt_p !~ /\/$/ ) {
$opt_p =~ s/$/\//;
}
$hummer->proxy(['http','ftp'], "$opt_p");
}

# Main function, pulls in Hooks (above).

$hummer->run($base_url);

# sub: url_test: Decide whether or not to follow link.

sub url_test {

my ($robot, $hook_name, $url) = @_;
my $prefix;

# No mail, file, query-string cgi or ldap urls.
# Return 0 to short-circuit loop.

if ( $opt_a ) {
return 0 if ( $url =~ /\.pl|\.cgi\?/ );
}
return 0 if ( $url =~ /(mailto:|ldap:|file:\/\/)/ );

# Strip front and end to get base url.

$prefix = $url;
$prefix =~ s/http:\/\/([^\/]+)\/.*/$1/;
chomp($prefix);

# If -l was given, keep the scavenging internal (leashed to the base url's domain).

if ( $opt_l ) {
if ( $prefix !~ /$leash/i ) {
if ( $farmfresh && $opt_b ) {
foreach $bittybit (@bargv) {
if ( $bittybit =~ /http/i ) {
if ( $bittybit !~ /\/$/ ) {
$bittybit =~ s/$/\//;
}
}
}
$bargv = join(" ", @bargv);
if ( $opt_v ) {
$bargv =~ s/(^.*)\.html[ \/](.*$)/${1}.htm $2/;
$bargv =~ s/-v//;
exec "$0 -w $bargv";
} elsif ( $opt_w ) {
$bargv =~ s/(^.*)index\.htm[ \/](.*$)/${1}default.htm $2/;
$bargv =~ s/-w//;
exec "$0 -x $bargv";
} elsif ( $opt_x ) {
$bargv =~ s/(^.*)\.htm[ \/](.*$)/${1}.html $2/;
$bargv =~ s/-x//;
exec "$0 -y $bargv";
} elsif ( $opt_y ) {
$bargv =~ s/(^.*)default\.html[ \/](.*$)/${1}default.asp $2/;
$bargv =~ s/-y//;
exec "$0 -z $bargv";
} elsif ( $opt_z ) {
print "$url not traversable in current context. Try the -b switch!\n";
print "Bailing out...\n";
} else {
$bargv =~ s/(^.*http.*) (.*$)/${1}index.html $2/;
exec "$0 -v $bargv";
}
}
return 0;
}
}
$farmfresh = 0;

if ( $opt_s ne "NONE" ) {
if ( $url !~ /$opt_s/i ) {
$okay = 0;
} else {
$okay = 1;
}
}

# With -e, note every url we scan (this can put a lot of text on your screen).

if ( $opt_e ) {
print "Scanning ${url}...\n";
}

# We made it here, must be good. Return 1 for invoke-on-contents.

return 1;
}

# sub get_error: When things go wrong, keep track of them.
# Invoked from within url_test or contents, generally triggered by bad links.

sub get_error {

my ($robot, $hook, $url, $response) = @_;

# With -e, report the error code for any failed GET.

if ( $opt_e ) {
print "Received " . $response->code . " for $url\n";
}

if ( $errorprone && $opt_b ) {
foreach $bittybit (@bargv) {
if ( $bittybit =~ /http/i ) {
if ( $bittybit !~ /\/$/ ) {
$bittybit =~ s/$/\//;
}
}
}
$bargv = join(" ", @bargv);
if ( $opt_v ) {
$bargv =~ s/(^.*)\.html[ \/](.*$)/${1}.htm $2/;
$bargv =~ s/-v//;
exec "$0 -w $bargv";
} elsif ( $opt_w ) {
$bargv =~ s/(^.*)index\.htm[ \/](.*$)/${1}default.htm $2/;
$bargv =~ s/-w//;
exec "$0 -x $bargv";
} elsif ( $opt_x ) {
$bargv =~ s/(^.*)\.htm[ \/](.*$)/${1}.html $2/;
$bargv =~ s/-x//;
exec "$0 -y $bargv";
} elsif ( $opt_y ) {
$bargv =~ s/(^.*)\.html[ \/](.*$)/${1}.asp $2/;
$bargv =~ s/-y//;
exec "$0 -z $bargv";
} elsif ( $opt_z ) {
print "Bailing out...\n";
} else {
$bargv =~ s/(^.*http.*) (.*$)/${1}index.html $2/;
exec "$0 -v $bargv";
}
}
# Return Code not important here.

return 1;
}

# sub contents: Get the pictures, finally...

sub contents {

my ($robot, $hook, $url, $response, $structure, $filename) = @_;

$errorprone = 0;
# Strip everything but the suffix of the filename.

$suffix = $url;
$suffix =~ s/^.*\/[a-zA-Z0-9_]+\.(.*)$/$1/;

# Define types of files we're interested in and hope for a match.

if ( $suffix =~ /$pcregexp/i && $okay ) {
# Create UserAgent. It needs a proxy setting, too, and realistic
# attributes. This will grab the pix for us. Creating and destroying
# the instance for each subsequent GET.

use LWP::UserAgent;

if ( $opt_d ) {
$difference = $url;
$difference =~ s/$basic(.*\/).*/$1/;
if ( $difference ne $base_url ) {
@mkdirs = split(/\//, $difference);
foreach $dirjob (@mkdirs) {
if ( ! -d $dirjob ) {
mkdir $dirjob, 0755;
chdir $dirjob;
} else {
chdir $dirjob;
}
}
}
}
# Flush the output again, just to be obsessive, and cleanse the url of
# all but the full filename. Flushing output should help keep garbage
# from choking up the buffer when we save the pix.

$| = 1;
$filename = $url;
$filename =~ s/^.*\/([^\/]+)$/$1/;
$grabber = new LWP::UserAgent;
if ( $opt_p ne "NONE" ) {
if ( $opt_p !~ /\/$/ ) {
$opt_p =~ s/$/\//;
}
$grabber->proxy(http => "$opt_p");
}
$grabber->agent("Mozilla/4.0" . $grabber->agent);

# Perform the GET.

my $request = new HTTP::Request('GET', "$url");

# Grab the result code to short-circuit if we failed.

my $result = $grabber->request($request);

if ( $result->is_success ) {

# Success? If file of same name has already been grabbed - some folks
# engage in the dangerous practice of naming their thumbnails the same
# as the full sized pix - add unique number to front of name. Save!

if ( $opt_u ) {
if ( -f $filename ) {
$filename = $uniquifier . $filename;
$uniquifier++;
}
}

# Save what the UserAgent just fetched.
$structure = $result->content;
open(TMPFILE, ">>$filename");
print TMPFILE $structure;
close(TMPFILE);

# Give a textual indication of your success.

print STDERR "Got $filename!!!\n";
} else {

# If you get here, you probably need to check your proxy settings.

print STDERR "No go for $url\n";
}
chdir $homebase;
}
}

sub usage {

print "Usage: $0 [-d] base_url [file ext 1]...[file ext n]\n";
exit;
}

