Jump to content

Scanning all google indexed pages


asgsoft

Recommended Posts

Hi

 

I have a script that scans for all the indexed URLs of a website. However, at the moment it only scans the first 100 by going to http://www.google.com/search?q=site:http://www.site.com/&num=100

 

I know that I can change the pages by changing the value of start= at the end.

 

However, how can I make it show all the indexed pages? Any ideas?

 

thanks.

 

The code I am using is below.

 

<?php
// Scans one Google results page and prints every linked URL that points at
// www.thewebmasterstool.com, followed by its PageRank.
//
// isUrlValid() and the PageRankXor32 class are not defined in this script;
// presumably they are provided by func.php.
include "func.php";

// The results page to scan is hard-coded; this overwrites any ?site= value
// supplied with the request.
$_GET["site"] = "http://www.google.com/search?q=site:http://www.site.com/&num=100";
$site = strip_tags($_GET["site"]);

// Text shown instead of a rank when a URL fails validation. $BadUrlText is not
// set anywhere in this script (func.php may define it); fall back to an empty
// string, which is what an undefined variable would have printed.
$badUrlText = isset($BadUrlText) ? $BadUrlText : '';

// Get the web site contents. A failed fetch returns false (file_get_contents()
// already emits a warning); treat it as an empty page so "None found" is shown.
$info = file_get_contents($site);
if ($info === false) {
    $info = '';
}

// Collect every <a ...>...</a> element of the page.
$anchors = array();
$offset = 0;
while (($open = stripos($info, '<a', $offset)) !== false) {
    // "<a" is also the start of <abbr>, <address>, <area>, ...: only accept it
    // when the next character is whitespace or ">".
    $next = (string) substr($info, $open + 2, 1);
    if ($next !== '>' && trim($next) !== '') {
        $offset = $open + 2;
        continue;
    }
    // Search for the closing tag *after* the opening one, so a stray "</a>"
    // earlier in the page cannot produce a negative substring length.
    $close = stripos($info, '</a>', $open);
    if ($close === false) {
        break;
    }
    $anchors[] = substr($info, $open, $close - $open + 4);
    $offset = $close + 4;
}

// Matches href="...", href='...' or an unquoted href=... value; the branch
// reset group (?|...) puts the value in capture group 1 in all three cases.
$hrefPattern = '/\shref\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>][^\s>]*))/i';

$linkCount = 0; // ordinary links seen (anchors with an href that is not mailto:)
foreach ($anchors as $anchor) {
    if (!preg_match($hrefPattern, $anchor, $m)) {
        continue; // named anchor, or no usable href attribute
    }
    $location = trim($m[1]);
    if (stripos($location, 'mailto:') === 0) {
        continue; // e-mail link
    }
    $linkCount++;

    // Only report links into the target site (dots escaped so they match
    // literally).
    if (!preg_match('#^http://www\.thewebmasterstool\.com#', $location)) {
        continue;
    }

    $oPR = new PageRankXor32();
    $PageRank = isUrlValid($location) ? $oPR->getRank($location) : $badUrlText;

    echo($location . "<strong> $PageRank</strong>\n<br />"); // Show it
}

if ($linkCount == 0) {
    echo("\n<br />None found");
}
echo("\n\n<br />");

?>

Link to comment
https://forums.phpfreaks.com/topic/52531-scanning-all-google-indexed-pages/
Share on other sites

<?php
include "func.php";

/**
 * Scans one HTML page for links and prints those that point at
 * www.thewebmasterstool.com, each followed by its PageRank.
 *
 * The page is fetched with file_get_contents(), every <a>...</a> element is
 * collected and the href value of each one is extracted. Links whose href
 * starts with "http://www.thewebmasterstool.com" are echoed as
 * "URL<strong> RANK</strong>" lines. "None found" is echoed when the page
 * contains no ordinary (non-mailto) links at all.
 *
 * Calls isUrlValid() and instantiates PageRankXor32, neither of which is
 * defined here; presumably both come from the included func.php.
 *
 * @param string $site URL of the page to scan (anything file_get_contents() can read).
 * @return void Results are echoed as HTML.
 */
function getGoogleLinks($site) {
    // Printed in place of a rank when isUrlValid() rejects a URL. No such text
    // is defined inside this function's scope, so an empty string is used.
    $badUrlText = '';

    // A failed fetch returns false (file_get_contents() already warns); treat
    // it as an empty page so the "None found" message is shown.
    $info = file_get_contents($site);
    if ($info === false) {
        $info = '';
    }

    // Collect every <a ...>...</a> element of the page.
    $anchors = array();
    $offset = 0;
    while (($open = stripos($info, '<a', $offset)) !== false) {
        // "<a" is also the start of <abbr>, <address>, <area>, ...: only
        // accept it when the next character is whitespace or ">".
        $next = (string) substr($info, $open + 2, 1);
        if ($next !== '>' && trim($next) !== '') {
            $offset = $open + 2;
            continue;
        }
        // Search for the closing tag *after* the opening one, so a stray
        // "</a>" earlier in the page cannot produce a negative length.
        $close = stripos($info, '</a>', $open);
        if ($close === false) {
            break;
        }
        $anchors[] = substr($info, $open, $close - $open + 4);
        $offset = $close + 4;
    }

    // Matches href="...", href='...' or an unquoted href=... value; the branch
    // reset group (?|...) puts the value in capture group 1 in all cases.
    $hrefPattern = '/\shref\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>][^\s>]*))/i';

    $linkCount = 0; // ordinary links seen (href present, not mailto:)
    foreach ($anchors as $anchor) {
        if (!preg_match($hrefPattern, $anchor, $m)) {
            continue; // named anchor, or no usable href attribute
        }
        $location = trim($m[1]);
        if (stripos($location, 'mailto:') === 0) {
            continue; // e-mail link
        }
        $linkCount++;

        // Only report links into the target site (dots escaped so they match
        // literally).
        if (!preg_match('#^http://www\.thewebmasterstool\.com#', $location)) {
            continue;
        }

        $oPR = new PageRankXor32();
        $PageRank = isUrlValid($location) ? $oPR->getRank($location) : $badUrlText;

        echo($location . "<strong> $PageRank</strong>\n<br />"); // Show it
    }

    if ($linkCount == 0) {
        echo("\n<br />None found");
    }
    echo("\n\n<br />");
}

// Base query: Google results for pages indexed under www.site.com, 100 per
// page. This overwrites any ?site= value supplied with the request.
$_GET["site"] = "http://www.google.com/search?q=site:http://www.site.com/&num=100";
$baseUrl = strip_tags($_GET["site"]);

// Walk the result pages through the start= offset (0, 100, ... 900). num= is
// the page size, so it stays at 100; the URL built for each page is the one
// actually passed to getGoogleLinks().
for ($i = 0; $i < 10; $i++) {
    $site = $baseUrl . "&start=" . ($i * 100);
    getGoogleLinks($site);
}

?>

 

How about that?

<?php

/**
 * Scans one HTML page for links and prints those that point at
 * www.thewebmasterstool.com.
 *
 * The page is fetched with file_get_contents(), every <a>...</a> element is
 * collected and the href value of each one is extracted. Links whose href
 * starts with "http://www.thewebmasterstool.com" are echoed as
 * "URL<strong> </strong>" lines. The PageRank lookup is disabled in this
 * variant (it needs func.php, which is not included here), so the <strong>
 * element carries no rank. "None found" is echoed when the page contains no
 * ordinary (non-mailto) links at all.
 *
 * @param string $site URL of the page to scan (anything file_get_contents() can read).
 * @return void Results are echoed as HTML.
 */
function getGoogleLinks($site) {
    // Rank placeholder. With the lookup disabled nothing ever assigns a rank,
    // so the same empty value is printed for every URL. The isUrlValid() call
    // is dropped for that reason: its result could not change the output, and
    // the function is not defined unless func.php is loaded.
    // NOTE(review): assumes isUrlValid() is a pure predicate - confirm.
    $PageRank = '';

    // A failed fetch returns false (file_get_contents() already warns); treat
    // it as an empty page so the "None found" message is shown.
    $info = file_get_contents($site);
    if ($info === false) {
        $info = '';
    }

    // Collect every <a ...>...</a> element of the page.
    $anchors = array();
    $offset = 0;
    while (($open = stripos($info, '<a', $offset)) !== false) {
        // "<a" is also the start of <abbr>, <address>, <area>, ...: only
        // accept it when the next character is whitespace or ">".
        $next = (string) substr($info, $open + 2, 1);
        if ($next !== '>' && trim($next) !== '') {
            $offset = $open + 2;
            continue;
        }
        // Search for the closing tag *after* the opening one, so a stray
        // "</a>" earlier in the page cannot produce a negative length.
        $close = stripos($info, '</a>', $open);
        if ($close === false) {
            break;
        }
        $anchors[] = substr($info, $open, $close - $open + 4);
        $offset = $close + 4;
    }

    // Matches href="...", href='...' or an unquoted href=... value; the branch
    // reset group (?|...) puts the value in capture group 1 in all cases.
    $hrefPattern = '/\shref\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>][^\s>]*))/i';

    $linkCount = 0; // ordinary links seen (href present, not mailto:)
    foreach ($anchors as $anchor) {
        if (!preg_match($hrefPattern, $anchor, $m)) {
            continue; // named anchor, or no usable href attribute
        }
        $location = trim($m[1]);
        if (stripos($location, 'mailto:') === 0) {
            continue; // e-mail link
        }
        $linkCount++;

        // Only report links into the target site (dots escaped so they match
        // literally).
        if (!preg_match('#^http://www\.thewebmasterstool\.com#', $location)) {
            continue;
        }

        echo($location . "<strong> $PageRank</strong>\n<br />"); // Show it
    }

    if ($linkCount == 0) {
        echo("\n<br />None found");
    }
    echo("\n\n<br />");
}

// Walk the Google result pages for the site through the start= offset
// (0, 100, ... 900). num= is the page size, so it stays fixed at 100.
$baseUrl = "http://www.google.com/search?q=site:http://www.thewebmasterstool.com/&num=100";
for ($i = 0; $i < 10; $i++) {
    $site = $baseUrl . "&start=" . ($i * 100);
    getGoogleLinks($site);
}

?>

 

I just ran this, it worked fine. I think I messed up with the $_GET["site"] part.

Yeah, I commented it out because I didn't have func.php. Here is the correct code with the PageRank lookup working.

 

<?php

/**
 * Scans one HTML page for links and prints those that point at
 * www.thewebmasterstool.com, each followed by its PageRank.
 *
 * The page is fetched with file_get_contents(), every <a>...</a> element is
 * collected and the href value of each one is extracted. Links whose href
 * starts with "http://www.thewebmasterstool.com" are echoed as
 * "URL<strong> RANK</strong>" lines. "None found" is echoed when the page
 * contains no ordinary (non-mailto) links at all.
 *
 * Calls isUrlValid() and instantiates PageRankXor32, neither of which is
 * defined here; presumably both live in func.php, which is loaded on demand
 * when they are missing.
 *
 * @param string $site URL of the page to scan (anything file_get_contents() can read).
 * @return void Results are echoed as HTML.
 */
function getGoogleLinks($site) {
    // Printed in place of a rank when isUrlValid() rejects a URL. No such text
    // is defined inside this function's scope, so an empty string is used.
    $badUrlText = '';

    // A failed fetch returns false (file_get_contents() already warns); treat
    // it as an empty page so the "None found" message is shown.
    $info = file_get_contents($site);
    if ($info === false) {
        $info = '';
    }

    // Collect every <a ...>...</a> element of the page.
    $anchors = array();
    $offset = 0;
    while (($open = stripos($info, '<a', $offset)) !== false) {
        // "<a" is also the start of <abbr>, <address>, <area>, ...: only
        // accept it when the next character is whitespace or ">".
        $next = (string) substr($info, $open + 2, 1);
        if ($next !== '>' && trim($next) !== '') {
            $offset = $open + 2;
            continue;
        }
        // Search for the closing tag *after* the opening one, so a stray
        // "</a>" earlier in the page cannot produce a negative length.
        $close = stripos($info, '</a>', $open);
        if ($close === false) {
            break;
        }
        $anchors[] = substr($info, $open, $close - $open + 4);
        $offset = $close + 4;
    }

    // Matches href="...", href='...' or an unquoted href=... value; the branch
    // reset group (?|...) puts the value in capture group 1 in all cases.
    $hrefPattern = '/\shref\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>][^\s>]*))/i';

    $linkCount = 0; // ordinary links seen (href present, not mailto:)
    foreach ($anchors as $anchor) {
        if (!preg_match($hrefPattern, $anchor, $m)) {
            continue; // named anchor, or no usable href attribute
        }
        $location = trim($m[1]);
        if (stripos($location, 'mailto:') === 0) {
            continue; // e-mail link
        }
        $linkCount++;

        // Only report links into the target site (dots escaped so they match
        // literally).
        if (!preg_match('#^http://www\.thewebmasterstool\.com#', $location)) {
            continue;
        }

        // This script has no include of its own, so pull in func.php the first
        // time the PageRank helpers are actually needed.
        // NOTE(review): assumes func.php is what defines them - confirm.
        if (!class_exists('PageRankXor32') || !function_exists('isUrlValid')) {
            include_once "func.php";
        }

        $oPR = new PageRankXor32();
        $PageRank = isUrlValid($location) ? $oPR->getRank($location) : $badUrlText;

        echo($location . "<strong> $PageRank</strong>\n<br />"); // Show it
    }

    if ($linkCount == 0) {
        echo("\n<br />None found");
    }
    echo("\n\n<br />");
}

// Walk the Google result pages for the site through the start= offset
// (0, 100, ... 900). num= is the page size, so it stays fixed at 100.
$baseUrl = "http://www.google.com/search?q=site:http://www.thewebmasterstool.com/&num=100";
for ($i = 0; $i < 10; $i++) {
    $site = $baseUrl . "&start=" . ($i * 100);
    getGoogleLinks($site);
}

?>

Archived

This topic is now archived and is closed to further replies.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.