Simplified Crawler

Scroll down to see the rest of this webpage

Play Classical Gas here

 In my last blog I included the code I wrote for the web crawler that powers my homepage slideshow. When I wrote it I was worried about things like dead websites, bad links, malicious code, etc. I through out most of that error correcting code, and made the crawler a lot simpler; it works better, too.

#!/usr/bin/perl
use strict;	
use warnings;
use Time::HiRes ('sleep');
use LWP::Simple;
use LWP::UserAgent;
use Net::FTP;
use Try::Tiny;

my(@links,%crawled,$cur_link,$var,$link_var,$temp,$pic,$ua,$index);
$ua = new LWP::UserAgent;
$ua->timeout(120);

my @UrlArr;
$UrlArr[0] = "http://www.ultimatecarpage.com";
$UrlArr[1] = "http://fuckyeahferrari.tumblr.com";
$UrlArr[2] = "http://a2zracer.com";
$UrlArr[3] = "http://community-info.lorraineprofeta.us/DavePics/AllPics.html";
$UrlArr[4] = "http://community-info.lorraineprofeta.us/DavePics/SSJ/Pictures.html";
$UrlArr[5] = "http://community-info.lorraineprofeta.us/DavePics/AlbertoAscari/AlbertoAscari.html";
$UrlArr[6] = "http://community-info.org/DavePics/ExcaliburSS/ExcaliburSS.html";
$UrlArr[7] = "http://community-info.org/DavePics/StutzBearcat/StutzBearcat.html";
$UrlArr[8] = "http://community-info.org/DavePics/Ferrari330P4/Ferrari330P4.html";


for (my $indexCntr = 0; $indexCntr < scalar(@UrlArr); $indexCntr++)
{
   push(@links, $UrlArr[$indexCntr]);
   print "$UrlArr[$indexCntr]/n";
} 

foreach $cur_link (@links)
{
        if($cur_link=~/^http/)
        {
	            # in the next few lines, we retrieve the page content
                
                chomp($cur_link);
                $cur_link =~ s/\r$//;                               
                $crawled{$cur_link} = 1 if defined $cur_link;
                print "Just got crawled value\n";               
	            my $request = new HTTP::Request('GET', $cur_link);
                #print "Just made request to web page: $!\n";
                my $response;
                if ($response = $ua->request($request))
                {
                    print "Just got a response from the web page\n";
                }
                else
                {
	               print "$!\n";
                }           
                #print "Get the page contents\n";     
                $var = $response->content();
                $link_var = $var;   
                #print "parse the image tags out of the content\n";
                my @p_pics =$var =~ /]+>/g;
                #print @p_pics;
                #if ther are are no images on this page, skip it.
                my $arraySize = @p_pics;
                #print $arraySize;
                
                my $source = "";
                foreach $temp(@p_pics)          
                {       
                     my $local_temp = substr $temp, 10;  
                     my $char_pos = index($local_temp, '"');
                     $temp = substr $local_temp, 0, $char_pos;  
                     if(index($temp, "http") == -1)
                       {
		          my $first = substr($temp, 0, 1);
                          if ($first eq '/')
                          {
                             $temp=$cur_link.$temp;
                          }
                          elsif ($first eq '.')
                          {
                              $temp = substr($temp, 3);
                              my $result = rindex($temp, '/');
                              $temp = substr($temp, 0, $result);
                              $temp = $cur_link.$temp;
                          }
                          else
                          {
                             $temp=$cur_link.'/'.$temp;
                          }
                     } 
                     $temp =~ /\bhttp?:[^)''"\s]+\.(?:jpg|JPG|jpeg|JPEG|gif|GIF|png|PNG)/;         
                     # Only interested in files that are > 64K in size
                     my($type,$size);
                     $size = 0;
                     $type, $size = head($temp);  
                     #print $size;                   
                     #print "print temp to a file so a web page can use it as the src for an img tag.\n";
                           
                           open (MYFILE, '>data.txt'); 
                           print MYFILE $temp;
                           close (MYFILE);  
                           print "Just wrote ".$temp." to data.txt\n";
                           sleep(0.25);
                           my $file = 'data.txt'; 
                           my $host = 'ip address of host server';
                           my $user = 'username';
                           my $pass = 'password';
                           my $dir  = 'directory of community-info.org on host server';
                           my $ftp  = Net::FTP->new($host, Debug => 0);
                           try
                           {
                              $ftp->login($user, $pass);
                              $ftp->cwd($dir);
                              $ftp->put($file);
                           }
                           catch
                           {
							   print "failed to upload data.txt\n";
						   }
						   finally
						   {
							   $ftp-> quit;
						   };
                           print "Just uploaded data.txt\n";
                }                
                print "\nCurrently Scanning -- ".$cur_link;
	            #In the next line we extract all links in the current page
                my @p_links= $var=~/<a href=\"(.*?)\">/g;
                foreach $temp(@p_links)
                {       
                    if((!($temp=~/^http/))&&($temp=~/^\//))
                    {
			           #This part of the code lets us correct internal addresses
                       $temp=$cur_link.$temp;
                    }
		            #In the next line we add the links
		            chomp($temp);
                    $temp =~ s/\r$//;  
					if ($crawled{$temp} != 1)
					{
						push(@links,$temp);
					}
                }
                #get rid of the top element of the links list, so we don't run out of mem
				@links.shift();
        }
}
    

Return To My Blog Page       Return To My Programming Page