Simplified Crawler

Scroll down to see the rest of this webpage

Play Classical Gas here

 In my last blog post I included the code I wrote for the web crawler that powers my homepage slideshow. When I wrote it, I was worried about things like dead websites, bad links, malicious code, etc. I threw out most of that error-correcting code and made the crawler a lot simpler; it works better, too.

		#!/usr/bin/perl
use strict;
use warnings;
use lib '/usr/local/share/perl/5.30.3/auto/Data/Validate/URI';
use Data::Validate::URI;
use LWP::Simple;
use LWP::UserAgent;
use Net::FTP;
use Time::HiRes ('sleep');
use Try::Tiny;
use URI;

# Shared crawler state, declared once at file scope and used by the code
# further down the script.
my (
    @links,      # URLs waiting to be visited
    %crawled,    # URLs that have already been handled
    $cur_link,
    $var,
    $link_var,
    $temp,
    $pic,
    $ua,
    $index,
);

# ASCII BEL character, printed to sound the terminal bell.
my $bell = chr(7);

# Validator used to decide whether an extracted link is a usable web URL.
my $uriValidator = Data::Validate::URI->new();

# HTTP client; give up on a page after 120 seconds.
$ua = LWP::UserAgent->new;
$ua->timeout(120);

# Seed pages: the crawl starts from these URLs, in this order.
my @UrlArr = (
    "http://kingsizejuggs.com",
    "http://allnude.sexy",
    "http://sexygirlspics.com",
    "http://bigtitsbreasts.com/",
    "http://maturenudewomen.net",
    "http://community-info.lorraineprofeta.us/DavePics/OpenPussy/OpenPussy.html",
    "http://community-info.lorraineprofeta.us/DavePics/AllPics.html",
    "http://community-info.lorraineprofeta.us/DavePics/SSJ/Pictures.html",
    "http://community-info.lorraineprofeta.us/DavePics/AlbertoAscari/AlbertoAscari.html",
    "http://community-info.org/DavePics/ExcaliburSS/ExcaliburSS.html",
    "http://community-info.org/DavePics/StutzBearcat/StutzBearcat.html",
    "http://community-info.org/DavePics/Ferrari330P4/Ferrari330P4.html",
    "http://www.ultimatecarpage.com",
    "http://wallpaperswide.com",
    "http://maturehomemadeporn.com",
    "http://aepics.com",
    "http://tophugeboobs.com",
    "http://nudemodels.sexy",
    "http://bigtitswebcams.net",
);

# Copy every seed onto the work list and echo it to the console.
foreach my $seed_url (@UrlArr)
{
    push @links, $seed_url;
    print $seed_url."\n";
}

# ---------------------------------------------------------------------------
# Main crawl loop.
#
# @links is used as a FIFO work queue: each pass takes the URL at the front,
# downloads the page, publishes every image URL found on it (one at a time,
# through data.txt uploaded over FTP) and appends the page's outgoing links
# to the back of the queue.
#
# %crawled records every URL that has been fetched OR queued, so the same
# link is never queued twice.
#
# Fixes relative to the previous version:
#   * the queue is drained with shift() in a while loop; the old code shifted
#     the array from inside "foreach (@links)", which made the iterator skip
#     entries;
#   * no loop control (next) from inside Try::Tiny blocks;
#   * relative image/link URLs are resolved against the page's base URL with
#     URI->new_abs instead of string concatenation;
#   * the image-extension test is actually applied (it used to be a regex in
#     void context) and accepts https as well as http;
#   * file and FTP operations are checked, so "Just uploaded" is only printed
#     after a successful upload.
# ---------------------------------------------------------------------------

# Where the current image URL is published (placeholder values).
my %ftp_target = (
    file => 'data.txt',
    host => 'ip address of host server',
    user => 'username',
    pass => 'password',
    dir  => 'directory for community-info.org on host server',
);

# An absolute http(s) URL that contains a common image file extension.
my $image_url_re = qr{\Ahttps?://[^)'"\s]+\.(?:jpe?g|gif|png)}i;

# Write $image_url to data.txt and upload that file to the FTP target.
# Returns 1 if the upload succeeded, 0 otherwise; never dies.
sub _publish_image_url
{
    my ($image_url) = @_;

    my $out;
    unless (open($out, '>', $ftp_target{file}))
    {
        print "failed to write ".$ftp_target{file}.": $!\n";
        return 0;
    }
    print {$out} $image_url;
    unless (close($out))
    {
        print "failed to write ".$ftp_target{file}.": $!\n";
        return 0;
    }
    print "Just wrote ".$image_url." to data.txt\n";
    print $bell;

    # Net::FTP->new returns undef when the connection cannot be made.
    my $ftp = Net::FTP->new($ftp_target{host}, Debug => 0);
    unless ($ftp)
    {
        print "failed to upload data.txt\n";
        return 0;
    }

    # Net::FTP methods report failure by returning false, so turn each
    # failure into an exception that the catch block can report.
    my $uploaded = try
    {
        $ftp->login($ftp_target{user}, $ftp_target{pass})
            or die "login: ".$ftp->message."\n";
        $ftp->cwd($ftp_target{dir})
            or die "cwd: ".$ftp->message."\n";
        $ftp->put($ftp_target{file})
            or die "put: ".$ftp->message."\n";
        1;
    }
    catch
    {
        print "failed to upload data.txt\n";
        warn "FTP error: $_";
        0;
    }
    finally
    {
        $ftp->quit;
    };

    return 0 unless $uploaded;

    print "Just uploaded data.txt\n";
    sleep(1.5);    # fractional sleep from Time::HiRes; paces the slideshow
    return 1;
}

PAGE:
while (@links)
{
    # Take the next page off the front of the queue.
    my $page_url = shift @links;
    chomp($page_url);
    $page_url =~ s/\r$//;
    print "just removed ".$page_url." from the front of the links list\n";

    next PAGE unless $page_url =~ /^http/;
    $crawled{$page_url} = 1;

    # Retrieve the page. LWP always returns a response object; failures are
    # reported through is_success/code/message, not through $!.
    my $response = $ua->get($page_url);
    print "Just got a response from the web page\n";
    unless ($response->is_success)
    {
        print "HTTP GET error code: ".$response->code."\n";
        print "HTTP GET error message: ".$response->message."\n";
        next PAGE;
    }

    my $html = $response->decoded_content;
    $html = $response->content unless defined $html;
    $html = '' unless defined $html;
    print "Received reply ".$html."\n";

    # Base URL that relative references on this page are resolved against.
    my $base = $response->base;

    # Parse the image tags out of the content; pages without images are
    # skipped entirely (their links are not followed either).
    my @img_tags = $html =~ /<img\b[^>]*>/gi;
    next PAGE unless @img_tags;
    print "This page has ".scalar(@img_tags)." images\n";

    IMAGE:
    foreach my $tag (@img_tags)
    {
        next IMAGE if index($tag, "powweb") > -1;

        my ($src) = $tag =~ /\ssrc\s*=\s*"([^"]+)"/i;
        next IMAGE unless defined $src;

        my $image_url = URI->new_abs($src, $base)->as_string;

        # Only publish things that look like real image files.
        next IMAGE unless $image_url =~ $image_url_re;

        # Lots of sites use transparent.gif files between images.
        if (index($image_url, "trans") > -1)
        {
            print "Didn't upload data.txt because it is a transparency\n";
            next IMAGE;
        }
        if (index($image_url, "powweb") > -1)
        {
            print "Didn't upload ".$image_url."\n";
            next IMAGE;
        }

        _publish_image_url($image_url);
    }

    print "\nCurrently Scanning -- ".$page_url;

    # Extract all links, whatever position href has inside the <a> tag.
    my @hrefs = $html =~ /<a\b[^>]*?\shref\s*=\s*"([^"]*)"/gi;

    LINK:
    foreach my $href (@hrefs)
    {
        chomp($href);
        $href =~ s/\r$//;

        # Make the link absolute and drop any #fragment so the same page is
        # not queued once per anchor.
        my $uri = URI->new_abs($href, $base);
        $uri->fragment(undef);
        my $link = $uri->as_string;

        print "We are going to add ".$link." to the links array\n";
        unless ($uriValidator->is_web_uri($link))
        {
            print "Didn't add ".$link." because it is not a valid url.\n";
            next LINK;
        }

        # Already fetched or already waiting in the queue.
        next LINK if $crawled{$link};

        $crawled{$link} = 1;
        push(@links, $link);
        print "just added ".$link." to the end of the links list\n";
    }
}
    

 I recently updated gas.html, too. I got rid of the problem with non-displayable images; I now display an image from my home directory instead. I also made a few more changes that speed up the slideshow by simplifying (or optimizing) the code. Here is my new code for gas.html:

<!DOCTYPE html>
<html>
<body>

<div>
<!-- Slideshow target: the script below looks this element up by id and swaps its src. -->
<img id="imageObject" src="http://community-info.org/images/Sophia-Loren-Mercedes-Benz-300SL-Gullwing-1955.jpg" alt="Slideshow image">
</div>

<script>
/**
 * Fetch `url` asynchronously, bypassing caches, and pass the finished
 * XMLHttpRequest to `myFunction` when the server answers 200.
 *
 * If the request completes with any other status (including network errors,
 * which report status 0) the fetch is retried after `retryDelayMs`
 * milliseconds, so a polling chain driven by the callback does not stop
 * permanently after one transient failure.
 *
 * @param {string}   url           resource to fetch
 * @param {function} myFunction    called with the XMLHttpRequest on success
 * @param {number}   [retryDelayMs=1000] delay before retrying a failed
 *                                 request; pass a negative value to disable
 */
function loadDoc(url, myFunction, retryDelayMs)
{
  if (retryDelayMs === undefined) {
    retryDelayMs = 1000;
  }

  var xhttp = new XMLHttpRequest();
  xhttp.onreadystatechange = function() {
    if (xhttp.readyState !== 4) {
      return; // request still in progress
    }
    if (xhttp.status === 200) {
      myFunction(xhttp);
    } else if (retryDelayMs >= 0) {
      setTimeout(function() {
        loadDoc(url, myFunction, retryDelayMs);
      }, retryDelayMs);
    }
  };

  // A unique query string defeats intermediate caches.
  var newURL = url + '?' + Date.now();
  xhttp.open("GET", newURL, true);
  xhttp.setRequestHeader('Cache-Control', 'no-cache');
  xhttp.send();
}
/**
 * Response handler for the data.txt poll: show the image whose URL is in the
 * response body, then schedule the next poll.
 *
 * The URL is replaced by a fallback picture when it does not look usable
 * (no "http", a transparent spacer image, or too short to be a real URL).
 * The image is preloaded off-screen so its natural size is known before the
 * visible <img id="imageObject"> is resized and switched over.
 *
 * @param {XMLHttpRequest} xhttp completed request; responseText holds the URL
 */
function myFunction(xhttp)
{
  // Local variable (this used to leak as an implicit global); trimmed so a
  // trailing newline in data.txt does not end up inside the URL.
  var text = String(xhttp.responseText || '').trim();

  if ((text.toLowerCase().indexOf("http") == -1) || (text.toLowerCase().indexOf("trans.gif") > -1))
  {
     text = "http://community-info.org/DavePics/pic05.jpg";
  }
  if (text.toLowerCase().indexOf('trans') > -1)
  {
     text = "http://community-info.org/DavePics/MB300SLR.jpg";
  }
  if (text.length < 12)
  {
      text = "http://lustfulbabespics.com/g/b4f563/th_03.jpg";
  }

  var img = new Image();

  img.onload = function()
  {
    var target = document.getElementById("imageObject");
    var naturalHeight = img.height;
    var naturalWidth = img.width;
    var displayWidth;
    var displayHeight;

    if (naturalWidth < 100)
    {
      // Tiny images (thumbnails, spacers) are blown up to a fixed size.
      // NOTE(review): the old code also assigned a fallback picture to src
      // here and immediately overwrote it with `text`; that dead assignment
      // is removed, the visible result is unchanged.
      displayWidth = 490;
      displayHeight = 540;
    }
    else if (naturalWidth > 490)
    {
      // Wide images are scaled to 110% of the body width, keeping the
      // aspect ratio.
      displayWidth = 1.1 * document.body.offsetWidth;
      displayHeight = naturalHeight * displayWidth / naturalWidth;
    }
    else
    {
      displayWidth = naturalWidth;
      displayHeight = naturalHeight;
    }

    // CSS lengths need a unit: a bare number assigned to style.width is
    // ignored in standards mode. Sizing every branch through style also
    // keeps a previous image's dimensions from sticking around.
    target.style.width = Math.round(displayWidth) + 'px';
    target.style.height = Math.round(displayHeight) + 'px';
    target.src = text;
  };
  img.src = text;

  // Poll again shortly; a function is passed instead of a string to eval.
  setTimeout(function() { loadDoc('data.txt', myFunction); }, 10);
}
// Initial request: fetch data.txt and hand the response to myFunction.
loadDoc('data.txt', myFunction);
</script>
</body>
</html>

Return To My Blog Page       Return To My Programming Page