iThreads Join Is Running Better Than Detach

In my last blog I explained why I was using thread->detach() instead of thread->join(). In a nutshell, getting the crawler to overcome the pitfalls it runs into on web pages requires testing on the running thread. Why use thread->join() (which is supposed to do all the testing and take care of all the loose ends for me) if I still need to run tests on the active (or recently inactive - out of scope) thread? Well, I found that if I'm selective about which pages to crawl, then a crawler that uses thread->join() runs more smoothly (and faster).  Here's the code I've finally settled on:

use strict;	
use warnings;
use LWP::Simple;
use LWP::UserAgent;
use Time::HiRes qw (sleep);
use threads();
use threads::shared();
# Crawler state, shared (as per-thread copies) between the main loop and crawl():
#   @links     - work queue of URLs; crawl() shifts the next page off the front
#   %crawled   - set of URLs already visited
#   $cur_link  - URL of the page currently being crawled
#   $var       - raw page content ($link_var keeps a copy for link parsing)
#   $temp      - scratch slot for each candidate image URL
#   $pic       - currently unused; kept so existing code still compiles
my (@links, %crawled, $cur_link, $var, $link_var, $temp, $pic, $ua);

# Use direct method-call syntax (Class->new), not indirect object syntax.
# Short timeout so an unresponsive site cannot stall a worker thread.
$ua = LWP::UserAgent->new;
$ua->timeout(10);

# Request/response slots used by crawl(); file-scoped lexicals instead of
# package globals - nothing outside this file needs them.
my ($request, $response);

# Index into @UrlArr of the next URL to queue; the main loop pre-increments
# it, so it starts below zero.
my $ArrCntr = -1;

# Master list of sites to crawl for large images, built with a single list
# assignment rather than one element assignment per line.
my @UrlArr = (
    "http://www.marvel.com",
    "http://cwtv.com/",
    "http://exoticspotter.com/",
    "http://maserati.com/",
    "http://formula1.ferrari.com/history",
    "http://stylebistro.com/",
    "http://soulnaked.com/",
    "http://blingee.com/blingee",
    "http://www.world-of-waterfalls.com/",
    "http://www.dailymail.co.uk/femail/article-2629235/Beautiful-Burlesque-Dancers-captured-camera-stunning-series-New-York-based-photographer.html",
    "http://ilovegermanshepherds.tumblr.com/",
    "http://readme.ru/",
    "http://www.chevrolet.com/",
    "http://racermag.kinja.com/the-most-beautiful-racecars-of-all-time-its-your-choi-1506825620",
    "http://www.airliners.net/search/photo.search?album=20939",
    "http://simplylingerie.tumblr.com/",
    "http://www.cutestpaw.com/articles/50-cute-cats-make-your-life-happier/",
    "http://www.cutestpaw.com/tag/cats/",
    "http://halloffame.hooters.com/",
    "http://copypast.ru/2007/09/11/krasivye_zhenshhiny_87_foto.html",
    "http://dccomics.com",
    "http://relax.ru/post/106604/Poslednie-fotografii-bespodobnoy-Dzhennifer-Eniston-ot-zhurnala-Hollywood-Reporter.html?feed=new",
    "http://just-lingerie.tumblr.com/",
    "http://community-info.org/DavePics/pics.html",
    "http://relax.ru",
    "http://qip.ru",
    "http://ferrari.com",
    "http://swimsuit.si.com/swimsuit/models/kate-upton/photos/1",
    "http://www.elle.com/",
    "http://www.cybernetplaza.com/formal-dresses.asp?gclid=CJf3jNzw074CFZBxOgodAU4Adw",
    "http://www.racingsportscars.com/make/photo/Maserati.html",
    "http://www.pinterest.com/kravitzt/pin-up-cheesecake-photos/",
    "http://www.refinery29.com/53717?utm_source=taboola&utm_medium=adsales&utm_content=beauty_slideshows#slide",
    "http://bendelaney.me",
    "http://www.bugatti.com/en/tradition/100-years-of-bugatti/stories-of-a-century/from-the-racetrack-to-the-road.html",
    "http://deviantart.com",
    "http://www.bwotd.com/category/clothed/",
    "http://huffingtonpost.com",
    "http://sportscarheaven.tumblr.com/",
    "http://community-info.org/CitronGallery/AllPics.html",
    "http://brasonly.tumblr.com/",
    "http://lovefrenchbulldogs.tumblr.com/",
);
# Main dispatch loop: cycle forever through the master URL list, handing one
# URL at a time to a worker thread and join()ing it before starting the next.
# join() (rather than detach()) lets the parent collect the worker's result
# and keeps thread cleanup in one place.
while (++$ArrCntr > -1)
{
   # Valid indices are 0 .. $#UrlArr, so wrap once the counter reaches
   # scalar(@UrlArr).  The original used '>' here, which let the counter
   # step one slot past the end and queue an undef URL.
   if ($ArrCntr >= scalar(@UrlArr))
   {
      $ArrCntr = 0;
   }
   unshift(@links, $UrlArr[$ArrCntr]);
   sleep(1.00);   # give the system a moment to clean up the previous thread
   my ($thr) = threads->create(\&crawl);
   my @ReturnData = $thr->join();
   print('Thread returned ', join(', ', @ReturnData), "\n");
}
# Thread entry point: take one URL (from the @links queue, falling back to
# the master @UrlArr list), fetch the page, extract every <img src="..."> tag,
# resolve relative image paths against the page URL, and write each image URL
# whose Content-Length exceeds 64K to data.txt for the slide-show page to use.
# Reads/writes the file-scoped globals @links, %crawled, $ArrCntr, @UrlArr
# and $ua.  Exits the thread early (threads->exit) on a failed request.
sub crawl
{
    my $cur_link = shift(@links);

    # Queue empty: advance the master-list cursor and take the next URL.
    if (not defined $cur_link)
    {
        ++$ArrCntr;
        # '>=' here: valid indices are 0 .. $#UrlArr; the original '>' test
        # allowed an out-of-range read yielding an undef URL.
        $ArrCntr = 0 if $ArrCntr >= scalar(@UrlArr);
        $cur_link = $UrlArr[$ArrCntr];
    }

    $crawled{$cur_link} = 1 if defined $cur_link;

    # Direct method-call syntax instead of 'new HTTP::Request(...)', and no
    # assignment buried inside the condition.  Bail out of the thread if the
    # request object could not be built.
    my $request = HTTP::Request->new('GET', $cur_link);
    if (not $request)
    {
        threads->exit();    # terminates the thread; nothing after runs
    }

    sleep(1.00);
    my $response = $ua->simple_request($request);
    if (not $response->is_success)
    {
        threads->exit();
    }

    my $var = $response->content();

    # Parse the <img ...> tags out of the page content.
    my @p_pics = $var =~ /<img src=\"[^>]+>/g;

    foreach my $temp (@p_pics)
    {
        # Strip the leading '<img src="' (10 chars) and everything from the
        # closing quote onward, leaving just the src value.
        my $local_temp = substr $temp, 10;
        my $char_pos   = index($local_temp, '"');
        $temp = substr $local_temp, 0, $char_pos;

        # Turn relative image paths into absolute URLs based on the page URL.
        if (index($temp, "http") == -1)
        {
            my $first = substr($temp, 0, 1);
            if ($first eq '/')
            {
                $temp = $cur_link . $temp;
            }
            elsif ($first eq '.')
            {
                # '../dir/file' style: drop the leading '../' and the
                # trailing file component before appending to the page URL.
                $temp = substr($temp, 3);
                my $result = rindex($temp, '/');
                $temp = substr($temp, 0, $result);
                $temp = $cur_link . $temp;
            }
            else
            {
                $temp = $cur_link . '/' . $temp;
            }
        }

        # NOTE(review): this match captures nothing and its result is
        # discarded, so it has no effect as written; kept for parity with
        # the original's apparent intent of validating the image URL.
        $temp =~ /\bhttps?:[^)''"\s]+\.(?:jpg|JPG|jpeg|JPEG|gif|GIF|png|PNG)/;

        # Only interested in files that are > 64K in size.  head() in list
        # context returns (content_type, document_length, ...); the original
        # called it in scalar context and compared the wrong value.
        my (undef, $size) = head($temp);

        # Print the URL to a file so a web page can use it as the src for an
        # img tag.  Three-arg open with a lexical handle, and both open and
        # close are checked (the original used an unchecked bareword handle).
        if ((defined $size) && ($size > 65536))
        {
            open(my $out, '>', 'data.txt') or die "open data.txt: $!";
            print {$out} $temp;
            close($out) or die "close data.txt: $!";
            print "Just wrote " . $temp . " to data.txt\n";
            sleep(0.25);
        }
        # else: file is too small to use; skip it.
    }
    return;
}

I just felt kind of guilty about that last blog. Yes, that's the code that will run robustly, but it's not what I'm running now (though it had been running for several days when I finally decided to ax it). The code above is what's generating the slide show you see on my homepage.

Return To My Blog Page       Return To My Programming Page