Tuesday, December 25, 2007

Website and URL Downloading Using ActiveState Perl for Windows

Merry Christmas:) Again, my apologies to those of you who don't observe the holiday.

Today's post (inlined scripts aside) will be nice and short. I'll leave it to you to refer to yesterday's post if you want more information, or commentary, regarding the basics of this script and how to use it, as it's almost exactly the same.

These are special versions of the dUrl and dUpeDL scripts that I've ported to use ActiveState's Perl for Windows. This comes in handy when you want to do some massive downloading and don't have access to a Linux or Solaris box.

Note that the one requirement of the original script, "wget," is also needed for this version of the script to work. You can download that for free at http://www.christopherlewis.com/WGet/default.htm. Also, again, note that you should modify the lines in dUrl that contain the dUpeDL script reference, as they indicate a fictitious location for it!

Enjoy the day off of work, the company of your family and/or friends and, hopefully, some much deserved peace and rest.

Cheers!


Creative Commons License


This work is licensed under a
Creative Commons Attribution-Noncommercial-Share Alike 3.0 United States License

#!c:\perl\bin\perl

#
# 2007 - Mike Golvach - eggi@comcast.net - beta v.000000000000000001a
#
# Creative Commons Attribution-Noncommercial-Share Alike 3.0 United States License
#

use LWP::Simple;

if ( $#ARGV < 0 ) {
print "Usage: $0 [URL|-f URL file]\n";
print "URL must be in http:// format - no https yet\n";
exit(1);
}

$debug=0;
$multi=0;
$counter=1;
# simple for now - better save-system later, maybe...
# also, we'll make the shared download system a function

if ( $ARGV[0] eq "-f" ) {
if ( ! -f $ARGV[1] ) {
print "Can't find URL file $ARGV[1]!\n";
exit(2);
}
$multi_file=$ARGV[1];
$multi=1;
chomp($download_dir="$ARGV[1]");
$download_dir =~ s/\//_/g;
$download_dir =~ s/\\/_/g;
if ( ! -d $download_dir ) {
system("mkdir $download_dir");
}
if ( ! -d $download_dir ) {
print "Can't make Download Directory ${download_dir}!\n";
print "Exiting...\n";
exit(2);
}
} else {
chomp($download_dir="$ARGV[0]");
if ( $download_dir !~ /^http:\/\//i ) {
print "Usage: $0 [URL|-f URL file]\n";
print "URL must be in http:// format - no https yet\n";
exit(1);
}
$download_dir =~ s/.*\/\/([^\/]*).*/$1/;
if ( ! -d $download_dir ) {
system("mkdir $download_dir");
}
if ( ! -d $download_dir ) {
print "Can't make Download Directory ${download_dir}!\n";
print "Exiting...\n";
exit(2);
}
}

if ( $multi == 0 ) {
@dl_list=();
$url="@ARGV";
chomp($url);
print "Parsing URL $url...\n";
$dl = get("$url");
@dl = split(/[><]/, $dl);
print "Feeding $url To The Machine...\n";
foreach $dl_item (@dl) {
next if ( $dl_item !~ /(href|img)/ );
next if ( $dl_item !~ /http:\/\// );
next if ( $dl_item !~ /(jpg|jpeg|gif|png)/ );
$dl_item =~ s/(a href|img src)=('|")//;
$dl_item =~ s/('|").*//;
push(@dl_list, $dl_item);
}
$is_it_worth_it = @dl_list;
if ( $is_it_worth_it == 0 ) {
print "No Image References found!\n";
print "No point in continuing...\n";
print "Moving $download_dir to ${download_dir}.empty...\n";
rename("$download_dir", "${download_dir}.empty");
exit(4);
}
print "Churning Out URL Requests...\n";
if ( $debug == 0 ) {
print "j=jpg g=gif p=png ?=guess\n";
}
chomp($this_dir=`cd`);
chdir("$download_dir");
$start_time=(time);
foreach $dl_req (@dl_list) {
$tmp_dl="";
$req_filename = $dl_req;
$req_filename =~ s/.*\///;
if ( $debug ) {
print "Grabbing $req_filename\n";
} else {
$file_ext = $req_filename;
$file_ext =~ s/.*(jpg|gif|png).*/$1/;
if ( $file_ext !~ /(jpg|gif|png)$/ ) {
print "\?";
} else {
$file_ext =~ s/^(\w).*/$1/;
print "$file_ext";
}
}
# Work that bastard extra hard if it's a PHP Trick-Link
if ( $dl_req =~ /php\?/ ) {
$dl_req =~ s/\&/\\&/g;
system("wget.exe -q $dl_req");
} else {
# We need wget.exe because the Simple GET can't follow trails
system("wget.exe -q $dl_req");
}
}
$end_time=(time);
$seconds = sprintf("%d", $end_time - $start_time);
print "...DONE in $seconds seconds!\n";
# PHP links are a pain -
print "Looking for PHP Trick-Links...\n";
chdir("$download_dir");
@file_list=`dir /B *php*`;
$file_list=@file_list;
if ( $file_list ) {
print "PHP Trick-Links Found. Attempting To Unravel...\n";
foreach $php_file (@file_list) {
chomp($php_file);
open(PHPFILE, "<$php_file");
@php_file = <PHPFILE>;
if ( $php_file =~ /img.php/ ) {
print "IMG - ";
foreach $php_seg (@php_file) {
if ( $php_seg =~ /SRC=/ ) {
$php_tail = $php_seg;
$php_tail =~ s/.*SRC=\"(.*?)\">.*/$1/;

$php_real_url = $php_root . $php_tail;
} elsif ( $php_seg =~ /HREF=http/ ) {
$php_root = $php_seg;
$php_root =~ s/.*=(http:\/\/[^\/]*\/).*/$1/;
chomp($php_root);
}
$php_real_url = $php_root . $php_tail;
}
} else {
print "REGULAR - ";
foreach $php_seg (@php_file) {
if ( $php_seg =~ /url=http/ ) {
$php_real_url=$php_seg;
$php_real_url =~ s/.*url=(http.*?)&.*/$1/;
}
}
}
close(PHPFILE);
if ( $debug ) {
print "Deleting Bogus Download: $php_file\n";
} else {
print "X=";
}
unlink("$php_file");
if ( $debug ) {
print "Downloading Real URL : $php_real_url";
} else {
$php_file_ext = $php_real_url;
$php_file_ext =~ s/.*(jpg|gif|png).*/$1/;
if ( $php_file_ext !~ /(jpg|gif|png)$/ ) {
print "\?";
} else {
$php_file_ext =~ s/^(\w).*/$1/;
chomp($php_file_ext);
print "$php_file_ext ";
}
}
system("wget.exe -q $php_real_url");
}
print "...Done!\n";
} else {
print "No PHP Trick-Links To Unravel... Good\n";
}
chdir("$download_dir");
# Trying more sophisticated MD5 duplicate checking
print "Checking for exact duplicates MD5-Sum+Size\n";
system("c:\\docume~1\\user\\desktop\\dUpeDL.pl");
chdir("$this_dir");
} elsif ( $multi == 1 ) {
open(MULTIFILE, "<$multi_file");
@multi_file = <MULTIFILE>;
close(MULTIFILE);
print "------------------- MULTIFILE MODE ------------------------\n";
foreach $multifile_entry (@multi_file) {
@dl_list=();
print "-------------------- FILE $counter ------------------------\n";
$url="$multifile_entry";
if ( $url !~ /^http:\/\//i ) {
print "Usage: $0 [URL|-f URL file]\n";
print "URL must be in http:// format - no https yet\n";
exit(1);
}
chomp($url);
print "Parsing URL $url...\n";
$dl = get("$url");
@dl = split(/[><]/, $dl);
print "Feeding $url To The Machine...\n";
foreach $dl_item (@dl) {
next if ( $dl_item !~ /(href|img)/ );
next if ( $dl_item !~ /http:\/\// );
next if ( $dl_item !~ /(jpg|jpeg|gif|png)/ );
$dl_item =~ s/(a href|img src)=('|")//;
$dl_item =~ s/('|").*//;
push(@dl_list, $dl_item);
}
$is_it_worth_it = @dl_list;
if ( $is_it_worth_it == 0 ) {
print "No Image References found!\n";
print "Trying next FILE\n";
}
print "Churning Out URL Requests...\n";
if ( $debug == 0 ) {
print "j=jpg g=gif p=png ?=guess\n";
}
chomp($this_dir=`cd`);
chdir("$download_dir");
$start_time=(time);
foreach $dl_req (@dl_list) {
$tmp_dl="";
$req_filename = $dl_req;
$req_filename =~ s/.*\///;
if ( $debug ) {
print "Grabbing $req_filename\n";
} else {
$file_ext = $req_filename;
$file_ext =~ s/.*(jpg|gif|png).*/$1/;
if ( $file_ext !~ /(jpg|gif|png)$/ ) {
print "\?";
} else {
$file_ext =~ s/^(\w).*/$1/;
print "$file_ext";
}
}
if ( $dl_req =~ /php\?/ ) {
$dl_req =~ s/\&/\\&/g;
system("wget.exe -q $dl_req");
} else {
system("wget.exe -q $dl_req");
}
}
$end_time=(time);
$seconds = sprintf("%d", $end_time - $start_time);
print "...DONE in $seconds seconds!\n";
print "Looking for PHP Trick-Links...\n";
chdir("$download_dir");
@file_list=`dir /B *php*`;
$file_list=@file_list;
if ( $file_list ) {
print "PHP Trick-Links Found. Attempting To Unravel...\n";
foreach $php_file (@file_list) {
chomp($php_file);
open(PHPFILE, "<$php_file");
@php_file = <PHPFILE>;
if ( $php_file =~ /img.php/ ) {
print "IMG - ";
foreach $php_seg (@php_file) {
if ( $php_seg =~ /SRC=/ ) {
$php_tail = $php_seg;
$php_tail =~ s/.*SRC=\"(.*?)\">.*/$1/;

$php_real_url = $php_root . $php_tail;
} elsif ( $php_seg =~ /HREF=http/ ) {
$php_root = $php_seg;
$php_root =~ s/.*=(http:\/\/[^\/]*\/).*/$1/;
chomp($php_root);
}
$php_real_url = $php_root . $php_tail;
}
} else {
print "REGULAR - ";
foreach $php_seg (@php_file) {
if ( $php_seg =~ /url=http/ ) {
$php_real_url=$php_seg;
$php_real_url =~ s/.*url=(http.*?)&.*/$1/;
}
}
}
close(PHPFILE);
if ( $debug ) {
print "Deleting Bogus Download: $php_file\n";
} else {
print "X=";
}
unlink("$php_file");
if ( $debug ) {
print "Downloading Real URL : $php_real_url";
} else {
$php_file_ext = $php_real_url;
$php_file_ext =~ s/.*(jpg|gif|png).*/$1/;
if ( $php_file_ext !~ /(jpg|gif|png)$/ ) {
print "\?";
} else {
$php_file_ext =~ s/^(\w).*/$1/;
chomp($php_file_ext);
print "$php_file_ext ";
}
}
system("wget.exe -v $php_real_url");
}
print "...Done!\n";
} else {
print "No PHP Trick-Links To Unravel... Good\n";
}
chdir("$download_dir");
# Trying more sophisticated MD5 duplicate checking
print "Checking for exact duplicates MD5-Sum+Size\n";
system("c:\\docume~1\\user\\desktop\\dUpeDL.pl");
chdir("$this_dir");
$counter++;
}
}

$|=1;

if ( $multi == 1 ) {
chdir("$this_dir");
rename("$multi_file", "${multi_file}.done");
}
exit(0);


---- dUpeDL - Based almost entirely on the findDupeFiles script by Cameron Hayne (macdev@hayne.net) - modified for win32

#!c:\perl\bin\perl

#
# dUpeDL - Based on the following script - only slightly modified to work with
# dUrl and Windows Perl.
# Below: The original liner notes for full attribution to the original author.
# Note that the attribution was taken verbatim from the Linux/Unix script and may not
# be entirely accurate due to the fact that this script is a win32 port.
#
# findDupeFiles:
# This script attempts to identify which files might be duplicates.
# It searches specified directories for files with a given suffix
# and reports on files that have the same MD5 digest.
# The suffix or suffixes to be searched for are specified by the first
# command-line argument - each suffix separated from the next by a vertical bar.
# The subsequent command-line arguments specify the directories to be searched.
# If no directories are specified on the command-line,
# it searches the current directory.
# Files whose names start with "._" are ignored.
#
# Cameron Hayne (macdev@hayne.net) January 2006 (revised March 2006)
#
#
# Examples of use:
# ----------------
# findDupeFiles '.aif|.aiff' AAA BBB CCC
# would look for duplicates among all the files with ".aif" or ".aiff" suffixes
# under the directories AAA, BBB, and CCC
#
# findDupeFiles '.aif|.aiff'
# would look for duplicates among all the files with ".aif" or ".aiff" suffixes
# under the current directory
#
# findDupeFiles '' AAA BBB CCC
# would look for duplicates among all the files (no matter what suffix)
# under the directories AAA, BBB, and CCC
#
# findDupeFiles
# would look for duplicates among all the files (no matter what suffix)
# under the current directory
# -----------------------------------------------------------------------------

use strict;
use warnings;

use File::Find;
use File::stat;
use Digest::MD5;
use Fcntl;

#REMOVE WHEN WE MERGE - UNNECESSARY
my $debug=0;

my $matchSomeSuffix;
if (defined($ARGV[0])) {
my @suffixes = split(/\|/, $ARGV[0]);
if (scalar(@suffixes) > 0) {
my $matchExpr = join('||', map {"m/\$suffixes[$_]\$/io"} 0..$#suffixes);
$matchSomeSuffix = eval "sub {$matchExpr}";
}
shift @ARGV;
}

my @searchDirs = @ARGV ? @ARGV : ".";
foreach my $dir (@searchDirs) {
die "\"$dir\" is not a directory\n" unless -d "$dir";
}
my %filesByDataLength;

sub calcMd5($) {

my ($filename) = @_;
if (-d $filename) {
return "unsupported";
}
sysopen(FILE, $filename, O_RDONLY) or die "Unable to open file \"$filename\": $!\n";
binmode(FILE);
my $md5 = Digest::MD5->new->addfile(*FILE)->hexdigest;
close(FILE);
return $md5;
}

sub hashByMd5($) {

my ($fileInfoListRef) = @_;
my %filesByMd5;
foreach my $fileInfo (@{$fileInfoListRef}) {
my $dirname = $fileInfo->{dirname};
my $filename = $fileInfo->{filename};
my $md5 = calcMd5("$dirname/$filename");
push(@{$filesByMd5{$md5}}, $fileInfo);
}
return \%filesByMd5;
}

sub checkFile() {

return unless -f $_;
my $filename = $_;
my $dirname = $File::Find::dir;
return if $filename =~ /^\._/;
if (defined($matchSomeSuffix)) {
return unless &$matchSomeSuffix;
}
my $statInfo = stat($filename) or warn "Can't stat file \"$dirname/$filename\": $!\n" and return;
my $size = $statInfo->size;
my $fileInfo = { 'dirname' => $dirname,
'filename' => $filename,
};
push(@{$filesByDataLength{$size}}, $fileInfo);
}

MAIN: {

find(\&checkFile, @searchDirs);
my $numDupes = 0;
my $numDupeBytes = 0;
if ( $debug ) {
print "Dupe Checking\n";
} else {
print "Dupe Checking - ";
}
foreach my $size (sort {$b<=>$a} keys %filesByDataLength) {
my $numSameSize = scalar(@{$filesByDataLength{$size}});
next unless $numSameSize > 1;
if ( $debug ) {
print "size: $size numSameSize: $numSameSize\n";
}
my $filesByMd5Ref = hashByMd5($filesByDataLength{$size});
my %filesByMd5 = %{$filesByMd5Ref};
foreach my $md5 (keys %filesByMd5) {
my @sameMd5List = @{$filesByMd5{$md5}};
my $numSameMd5 = scalar(@sameMd5List);
next unless $numSameMd5 > 1;
my $rsrcMd5;
my $dupe_counter=0;
foreach my $fileInfo (@sameMd5List) {
my $dirname = $fileInfo->{dirname};
my $filename = $fileInfo->{filename};
my $filepath = "$dirname/$filename";
if ( $dupe_counter == 0 ) {
if ( $debug ) {
print "KEEPING $filepath - MD5 $md5\n";
}
$dupe_counter++;
} else {
if ( $debug ) {
print "DELETING $filepath - MD5 $md5\n";
} else {
print "D";
}
unlink("$filepath");
}
}
if ( $debug) {
print "----------\n";
}
$numDupes += ($numSameMd5 - 1);
$numDupeBytes += ($size * ($numSameMd5 - 1));
}
}
print "----------\n";
my $numDupeMegabytes = sprintf("%.1f", $numDupeBytes / (1024 * 1024));
print "Number of duplicate files: $numDupes\n";
print "Estimated Mb Savings: $numDupeMegabytes\n";
}



, Mike