Jump to content

Scanning all google indexed pages


asgsoft

Recommended Posts

Hi

 

I have a script that scans for all the indexed URLs of a website. However, at the moment it only scans the first 100 results, by requesting http://www.google.com/search?q=site:http://www.site.com/&num=100

 

I know that I can change the pages by changing the value of start= at the end.

 

However, how can I make it show all the indexed pages? Any ideas?

 

thanks.

 

The code I am using is below.

 

<?php
include "func.php";

// Results page to scan. The address is fixed in the script and is never read
// from the request, so no user input reaches file_get_contents().
$site = "http://www.google.com/search?q=site:http://www.site.com/&num=100";

// A failed fetch yields false; treat it as an empty page so the scan below
// simply finds nothing and "None found" is printed.
$info = file_get_contents($site);
if ($info === false) {
    $info = "";
}

// Collect the raw markup of every <a ...>...</a> element. strpos() returns
// false when nothing is found and 0 for a match at the very start, so its
// result is compared with === / !== rather than with "> -1" or '== ""'.
// (The former ereg_replace('<', "<", ...) calls are gone: ereg_replace() no
// longer exists in PHP 7+, and replacing a character with itself did nothing.)
$anchors = array();
$offset  = 0;
while (($open = strpos($info, "<a", $offset)) !== false) {
    // Search for the closing tag after the opening one; a stray </a> earlier
    // in the page would otherwise give a negative substring length.
    $close = strpos($info, "</a>", $open);
    if ($close === false) {
        break;
    }
    $anchors[] = substr($info, $open, $close - $open + 4);
    $offset    = $close + 4;
}

// NOTE(review): $BadUrlText and $locationCount are never assigned in this
// script; presumably func.php defines them. Fall back to neutral values so the
// script does not read undefined variables - confirm against func.php.
if (!isset($BadUrlText)) {
    $BadUrlText = "";
}
if (!isset($locationCount)) {
    $locationCount = 0;
}

$found = 0;    // anchors that carry an href and are not mailto links
$oPR   = null; // PageRank helper, created on first use

foreach ($anchors as $anchor) {
    // mailto links and anchors without an href are not result links.
    if (strpos($anchor, "mailto") !== false || strpos($anchor, "href") === false) {
        continue;
    }
    $found++;

    // The href value may be double-quoted, single-quoted or bare. The (?| )
    // branch-reset group stores the value in $match[1] whichever form matched,
    // and each quoted form ends at its own matching quote.
    if (!preg_match('/href\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s>]+))/', $anchor, $match)) {
        continue;
    }

    // Only links into the site of interest are reported.
    if (strpos($match[1], "http://www.thewebmasterstool.com") !== 0) {
        continue;
    }

    // Use the first line of the value with surrounding whitespace removed.
    $lines    = explode("\n", $match[1]);
    $location = trim($lines[0]);

    // NOTE(review): the original compared $locationCount with $MaxUrls only
    // inside a per-link loop, so it never limited how many links were
    // processed. No cap is applied here either; add one at this point if
    // func.php defines $MaxUrls and a limit is wanted.
    if (isUrlValid($location)) {
        if ($oPR === null) {
            $oPR = new PageRankXor32();
        }
        $PageRank = $oPR->getRank($location);
        ++$locationCount;
    } else {
        $PageRank = $BadUrlText;
    }

    // The URL comes from a remote page: escape it before echoing it as HTML.
    // double_encode is off so entities already present (e.g. &amp;) stay intact.
    echo(htmlspecialchars($location, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8", false) . "<strong> $PageRank</strong>\n<br />");//Show it
}

if ($found == 0) {
    echo("\n<br />None found");
}
echo("\n\n<br />");

?>

Link to comment
Share on other sites

<?php
include "func.php";

/**
 * Fetches one results page and prints the links on it that point at
 * http://www.thewebmasterstool.com, each followed by its PageRank.
 *
 * Output is echoed as HTML: one "URL<strong> rank</strong>" line per matching
 * link, "None found" when the page has no anchor with an href (mailto anchors
 * are ignored), and a trailing line break in every case.
 *
 * Calls isUrlValid() and instantiates PageRankXor32, neither of which is
 * defined in this snippet (presumably both come from the included func.php).
 *
 * @param string $site URL, or any other path file_get_contents() can read,
 *                     of the page to scan.
 * @return void
 */
function getGoogleLinks($site) {
    // A failed fetch yields false; treat it as an empty page so the scan below
    // simply finds nothing and "None found" is printed.
    $info = file_get_contents($site);
    if ($info === false) {
        $info = "";
    }

    // Collect the raw markup of every <a ...>...</a> element. strpos() returns
    // false when nothing is found and 0 for a match at the very start, so its
    // result is compared with === / !== rather than with "> -1" or '== ""'.
    // (The former ereg_replace('<', "<", ...) calls are gone: ereg_replace()
    // no longer exists in PHP 7+, and replacing a character with itself did
    // nothing.)
    $anchors = array();
    $offset  = 0;
    while (($open = strpos($info, "<a", $offset)) !== false) {
        // Search for the closing tag after the opening one; a stray </a>
        // earlier in the page would otherwise give a negative substring length.
        $close = strpos($info, "</a>", $open);
        if ($close === false) {
            break;
        }
        $anchors[] = substr($info, $open, $close - $open + 4);
        $offset    = $close + 4;
    }

    $found      = 0;    // anchors that carry an href and are not mailto links
    $oPR        = null; // PageRank helper, created on first use
    // Text shown in place of a rank for URLs isUrlValid() rejects. The
    // original read an undefined $BadUrlText here, which printed nothing.
    $badUrlText = "";

    foreach ($anchors as $anchor) {
        // mailto links and anchors without an href are not result links.
        if (strpos($anchor, "mailto") !== false || strpos($anchor, "href") === false) {
            continue;
        }
        $found++;

        // The href value may be double-quoted, single-quoted or bare. The
        // (?| ) branch-reset group stores the value in $match[1] whichever
        // form matched, and each quoted form ends at its own matching quote.
        if (!preg_match('/href\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s>]+))/', $anchor, $match)) {
            continue;
        }

        // Only links into the site of interest are reported.
        if (strpos($match[1], "http://www.thewebmasterstool.com") !== 0) {
            continue;
        }

        // Use the first line of the value with surrounding whitespace removed.
        $lines    = explode("\n", $match[1]);
        $location = trim($lines[0]);

        if (isUrlValid($location)) {
            if ($oPR === null) {
                $oPR = new PageRankXor32();
            }
            $PageRank = $oPR->getRank($location);
        } else {
            $PageRank = $badUrlText;
        }

        // The URL comes from a remote page: escape it before echoing it as
        // HTML. double_encode is off so entities already present stay intact.
        echo(htmlspecialchars($location, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8", false) . "<strong> $PageRank</strong>\n<br />");//Show it
    }

    if ($found == 0) {
        echo("\n<br />None found");
    }
    echo("\n\n<br />");
}

// Walk through the result pages 100 hits at a time. In Google's query string
// num is the page size and start is the offset of the first hit, so it is
// start (not num) that has to advance on every request. The URL that was just
// built is the one passed on; previously a constant $_GET["site"] value was
// passed instead, which fetched the same first page nine times.
for ($i = 0; $i < 9; $i++) {
    $site = "http://www.google.com/search?q=site:http://www.site.com/&num=100&start=" . ($i * 100);
    getGoogleLinks($site);
}

?>

 

How about that?

Link to comment
Share on other sites

<?php

/**
 * Fetches one results page and prints the links on it that point at
 * http://www.thewebmasterstool.com.
 *
 * This variant has the PageRank lookup switched off, so every printed link is
 * followed by an empty "<strong> </strong>" placeholder where the rank would
 * go. It needs nothing outside this snippet.
 *
 * Output is echoed as HTML: one line per matching link, "None found" when the
 * page has no anchor with an href (mailto anchors are ignored), and a
 * trailing line break in every case.
 *
 * @param string $site URL, or any other path file_get_contents() can read,
 *                     of the page to scan.
 * @return void
 */
function getGoogleLinks($site) {
    // A failed fetch yields false; treat it as an empty page so the scan below
    // simply finds nothing and "None found" is printed.
    $info = file_get_contents($site);
    if ($info === false) {
        $info = "";
    }

    // Collect the raw markup of every <a ...>...</a> element. strpos() returns
    // false when nothing is found and 0 for a match at the very start, so its
    // result is compared with === / !== rather than with "> -1" or '== ""'.
    // (The former ereg_replace('<', "<", ...) calls are gone: ereg_replace()
    // no longer exists in PHP 7+, and replacing a character with itself did
    // nothing.)
    $anchors = array();
    $offset  = 0;
    while (($open = strpos($info, "<a", $offset)) !== false) {
        // Search for the closing tag after the opening one; a stray </a>
        // earlier in the page would otherwise give a negative substring length.
        $close = strpos($info, "</a>", $open);
        if ($close === false) {
            break;
        }
        $anchors[] = substr($info, $open, $close - $open + 4);
        $offset    = $close + 4;
    }

    $found = 0; // anchors that carry an href and are not mailto links

    foreach ($anchors as $anchor) {
        // mailto links and anchors without an href are not result links.
        if (strpos($anchor, "mailto") !== false || strpos($anchor, "href") === false) {
            continue;
        }
        $found++;

        // The href value may be double-quoted, single-quoted or bare. The
        // (?| ) branch-reset group stores the value in $match[1] whichever
        // form matched, and each quoted form ends at its own matching quote.
        if (!preg_match('/href\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s>]+))/', $anchor, $match)) {
            continue;
        }

        // Only links into the site of interest are reported.
        if (strpos($match[1], "http://www.thewebmasterstool.com") !== 0) {
            continue;
        }

        // Use the first line of the value with surrounding whitespace removed.
        $lines    = explode("\n", $match[1]);
        $location = trim($lines[0]);

        // The old isUrlValid() check is not made here: with the PageRank
        // lookup disabled its result never changed the output, and the
        // function is not defined anywhere in this snippet.

        // The URL comes from a remote page: escape it before echoing it as
        // HTML. double_encode is off so entities already present stay intact.
        echo(htmlspecialchars($location, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8", false) . "<strong> </strong>\n<br />");//Show it
    }

    if ($found == 0) {
        echo("\n<br />None found");
    }
    echo("\n\n<br />");
}

// Walk through the result pages 100 hits at a time. In Google's query string
// num is the page size and start is the offset of the first hit, so it is
// start (not num) that has to advance on every request.
for ($i = 0; $i < 9; $i++) {
    $site = "http://www.google.com/search?q=site:http://www.thewebmasterstool.com/&num=100&start=" . ($i * 100);
    getGoogleLinks($site);
}

?>

 

I just ran this and it worked fine. I think I messed up the $_GET["site"] part.

Link to comment
Share on other sites

Yes, I commented it out because I didn't have func.php. Here is the correct code with the PageRank (PR) lookup working.

 

<?php

/**
 * Fetches one results page and prints the links on it that point at
 * http://www.thewebmasterstool.com, each followed by its PageRank.
 *
 * Output is echoed as HTML: one "URL<strong> rank</strong>" line per matching
 * link, "None found" when the page has no anchor with an href (mailto anchors
 * are ignored), and a trailing line break in every case.
 *
 * Calls isUrlValid() and instantiates PageRankXor32, neither of which is
 * defined in this snippet; func.php is loaded on demand to supply them
 * (NOTE(review): presumably that is where both live - confirm).
 *
 * @param string $site URL, or any other path file_get_contents() can read,
 *                     of the page to scan.
 * @return void
 */
function getGoogleLinks($site) {
    // A failed fetch yields false; treat it as an empty page so the scan below
    // simply finds nothing and "None found" is printed.
    $info = file_get_contents($site);
    if ($info === false) {
        $info = "";
    }

    // Collect the raw markup of every <a ...>...</a> element. strpos() returns
    // false when nothing is found and 0 for a match at the very start, so its
    // result is compared with === / !== rather than with "> -1" or '== ""'.
    // (The former ereg_replace('<', "<", ...) calls are gone: ereg_replace()
    // no longer exists in PHP 7+, and replacing a character with itself did
    // nothing.)
    $anchors = array();
    $offset  = 0;
    while (($open = strpos($info, "<a", $offset)) !== false) {
        // Search for the closing tag after the opening one; a stray </a>
        // earlier in the page would otherwise give a negative substring length.
        $close = strpos($info, "</a>", $open);
        if ($close === false) {
            break;
        }
        $anchors[] = substr($info, $open, $close - $open + 4);
        $offset    = $close + 4;
    }

    $found      = 0;    // anchors that carry an href and are not mailto links
    $oPR        = null; // PageRank helper, created on first use
    // Text shown in place of a rank for URLs isUrlValid() rejects. The
    // original read an undefined $BadUrlText here, which printed nothing.
    $badUrlText = "";

    foreach ($anchors as $anchor) {
        // mailto links and anchors without an href are not result links.
        if (strpos($anchor, "mailto") !== false || strpos($anchor, "href") === false) {
            continue;
        }
        $found++;

        // The href value may be double-quoted, single-quoted or bare. The
        // (?| ) branch-reset group stores the value in $match[1] whichever
        // form matched, and each quoted form ends at its own matching quote.
        if (!preg_match('/href\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s>]+))/', $anchor, $match)) {
            continue;
        }

        // Only links into the site of interest are reported.
        if (strpos($match[1], "http://www.thewebmasterstool.com") !== 0) {
            continue;
        }

        // Use the first line of the value with surrounding whitespace removed.
        $lines    = explode("\n", $match[1]);
        $location = trim($lines[0]);

        // This snippet has no include of its own, so pull in the PageRank
        // helpers the first time they are actually needed.
        if (!function_exists("isUrlValid") || !class_exists("PageRankXor32")) {
            include_once "func.php";
        }

        if (isUrlValid($location)) {
            if ($oPR === null) {
                $oPR = new PageRankXor32();
            }
            // This call had been left commented out, so no rank was printed.
            $PageRank = $oPR->getRank($location);
        } else {
            $PageRank = $badUrlText;
        }

        // The URL comes from a remote page: escape it before echoing it as
        // HTML. double_encode is off so entities already present stay intact.
        echo(htmlspecialchars($location, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8", false) . "<strong> $PageRank</strong>\n<br />");//Show it
    }

    if ($found == 0) {
        echo("\n<br />None found");
    }
    echo("\n\n<br />");
}

// Walk through the result pages 100 hits at a time. In Google's query string
// num is the page size and start is the offset of the first hit, so it is
// start (not num) that has to advance on every request.
for ($i = 0; $i < 9; $i++) {
    $site = "http://www.google.com/search?q=site:http://www.thewebmasterstool.com/&num=100&start=" . ($i * 100);
    getGoogleLinks($site);
}

?>

Link to comment
Share on other sites

This thread is more than a year old. Please don't revive it unless you have something important to add.

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Restore formatting

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.