asgsoft Posted May 22, 2007 Share Posted May 22, 2007 Hi I have a script that scans for all the indexed urls for a webpage. However at the moment it only scans the first 100 by going to http://www.google.com/search?q=site:http://www.site.com/&num=100 I know that I can change the pages by changing the value of start= at the end. However how can I make it to show all the indexed pages? any ideas? thanks. The code I am using is below.
<?php
include "func.php";

// Scans one search-results page for anchors, extracts each href and prints
// the PageRank of every link that starts with http://www.thewebmasterstool.com.

$_GET["site"] = "http://www.google.com/search?q=site:http://www.site.com/&num=100";
$site = strip_tags($_GET["site"]);

// Get the web site contents; a failed fetch is treated as an empty page so
// the script still ends with "None found" instead of scanning `false`.
$info = file_get_contents($site);
if ($info === false) {
    $info = '';
}

// NOTE(review): $MaxUrls, $BadUrlText and $locationCount are never assigned
// in this script; presumably func.php defines them -- confirm. The fallbacks
// below only take effect when it does not.
if (!isset($locationCount)) {
    $locationCount = 0;
}
if (!isset($BadUrlText)) {
    $BadUrlText = '';
}

// Anchors sorted by kind: 1 = normal href links, 2 = mailto links,
// 3 = named anchors, 4 = anything else.
$links = array(1 => array(), 2 => array(), 3 => array(), 4 => array());

// The former ereg_replace('<', "<", ...) / ereg_replace('>', ">", ...) calls
// replaced each character with itself and ereg_replace() no longer exists
// in PHP 7+, so they have been dropped.
$spos = 0; // Where to carry on from
while (true) {
    $pos1 = strpos($info, "<a", $spos);
    if ($pos1 === false) {
        break;
    }
    // Search for the closing tag after the opening one so the slice length
    // can never become negative.
    $pos2 = strpos($info, "</a>", $pos1);
    if ($pos2 === false) {
        break;
    }

    $anchor = substr($info, $pos1, ($pos2 - $pos1) + 4);
    $spos = $pos2 + 4;

    if (strpos($anchor, "mailto") !== false) { // Mail to link
        $links[2][] = $anchor;
    } elseif (strpos($anchor, "href") !== false) { // Norm link
        $links[1][] = $anchor;
    } elseif (strpos($anchor, "name") !== false) { // Return link
        $links[3][] = $anchor;
    } else { // Error links
        $links[4][] = $anchor;
    }
}

$l1 = count($links[1]);

foreach ($links[1] as $counter => $anchor) {
    // Find where the href value starts and measure it up to the delimiter
    // that matches how it was opened: the same quote character, or the
    // first space / ">" for an unquoted value.
    if (($pos1 = strpos($anchor, "href=\"")) !== false) {
        $pos1 += 6;
        $length = strcspn($anchor, "\"", $pos1);
    } elseif (($pos1 = strpos($anchor, "href='")) !== false) {
        $pos1 += 6;
        $length = strcspn($anchor, "'", $pos1);
    } elseif (($pos1 = strpos($anchor, "href=")) !== false) {
        $pos1 += 5;
        $length = strcspn($anchor, " >", $pos1);
    } else {
        // "href" appears in the anchor but not as href=...; nothing to extract.
        continue;
    }

    $location = substr($anchor, $pos1, $length); // At last, get the address

    if (preg_match("/^http:\/\/www\.thewebmasterstool\.com/", $location)) {
        $oPR = new PageRankXor32();
        $PageRank = '';
        $aUrl = explode("\n", $location);

        // Step thru array of urls
        foreach ($aUrl as $location) {
            // Trim off any whitespace
            $location = trim($location);

            // Ignore blank lines
            if ($location != '') {
                if (isUrlValid($location)) {
                    $PageRank = $oPR->getRank($location);
                    ++$locationCount;
                } else {
                    $PageRank = $BadUrlText;
                }
            }

            // Limit the number of urls allowed
            if (isset($MaxUrls) && $locationCount >= $MaxUrls) {
                break;
            }
        }

        echo($location . "<strong> $PageRank</strong>\n<br />"); // Show it
    }

    $links[1][$counter] = $location;
}

if ($l1 == 0) {
    echo("\n<br />None found");
}
echo("\n\n<br />");
?>
Quote Link to comment Share on other sites More sharing options...
asgsoft Posted May 25, 2007 Author Share Posted May 25, 2007 any ideas? Quote Link to comment Share on other sites More sharing options...
ToonMariner Posted May 25, 2007 Share Posted May 25, 2007 increase 100 to 100000? Quote Link to comment Share on other sites More sharing options...
per1os Posted May 25, 2007 Share Posted May 25, 2007
<?php
include "func.php";

/**
 * Fetch one search-results page and print the PageRank of each matching link.
 *
 * The page at $site is scanned for <a ...>...</a> elements. For every anchor
 * that carries an href (mailto links are skipped) the href value is
 * extracted, and each value starting with http://www.thewebmasterstool.com
 * is echoed followed by its rank. "None found" is printed when the page
 * holds no href anchors at all.
 *
 * PageRankXor32 and isUrlValid() are not defined in this script; presumably
 * they come from func.php -- confirm.
 *
 * @param string $site       Address passed to file_get_contents().
 * @param string $badUrlText Shown instead of a rank when isUrlValid() rejects
 *                           the URL. Defaults to an empty string, which is
 *                           what the previously unassigned local $BadUrlText
 *                           evaluated to.
 * @return void
 */
function getGoogleLinks($site, $badUrlText = '')
{
    // Get the web site contents; a failed fetch is treated as an empty page.
    $info = file_get_contents($site);
    if ($info === false) {
        $info = '';
    }

    // Collect every anchor that has an href and is not a mailto link.
    // The former ereg_replace() calls replaced "<" and ">" with themselves
    // and ereg_replace() no longer exists in PHP 7+, so they are gone.
    $anchors = array();
    $spos = 0; // Where to carry on from
    while (true) {
        $open = strpos($info, "<a", $spos);
        if ($open === false) {
            break;
        }
        // Search for the closing tag after the opening one so the slice
        // length can never become negative.
        $close = strpos($info, "</a>", $open);
        if ($close === false) {
            break;
        }

        $anchor = substr($info, $open, ($close - $open) + 4);
        $spos = $close + 4;

        if (strpos($anchor, "mailto") !== false) {
            continue;
        }
        if (strpos($anchor, "href") !== false) {
            $anchors[] = $anchor;
        }
    }

    foreach ($anchors as $anchor) {
        // Find where the href value starts and measure it up to the
        // delimiter that matches how it was opened: the same quote
        // character, or the first space / ">" for an unquoted value.
        if (($start = strpos($anchor, "href=\"")) !== false) {
            $start += 6;
            $length = strcspn($anchor, "\"", $start);
        } elseif (($start = strpos($anchor, "href='")) !== false) {
            $start += 6;
            $length = strcspn($anchor, "'", $start);
        } elseif (($start = strpos($anchor, "href=")) !== false) {
            $start += 5;
            $length = strcspn($anchor, " >", $start);
        } else {
            // "href" appears in the anchor but not as href=...; skip it.
            continue;
        }

        $location = substr($anchor, $start, $length);

        // Only links under this hard-coded prefix are ranked and printed.
        if (preg_match("/^http:\/\/www\.thewebmasterstool\.com/", $location)) {
            $oPR = new PageRankXor32();

            // The old loop over explode("\n", ...) always stopped after the
            // first line because its $MaxUrls limit was an unset local, so
            // only the first line is used here.
            $lines = explode("\n", $location);
            $location = trim($lines[0]);

            if (isUrlValid($location)) {
                $PageRank = $oPR->getRank($location);
            } else {
                $PageRank = $badUrlText;
            }

            echo($location . "<strong> $PageRank</strong>\n<br />"); // Show it
        }
    }

    if (count($anchors) == 0) {
        echo("\n<br />None found");
    }
    echo("\n\n<br />");
}

// Walk the result pages: num stays at 100 results per page and start is the
// offset of the first result on that page. The freshly built $site is what
// gets passed on, not a fixed $_GET["site"] value.
for ($i = 0; $i < 10; $i++) {
    $site = "http://www.google.com/search?q=site:http://www.site.com/&num=100&start=" . ($i * 100);
    getGoogleLinks(strip_tags($site));
}
?>
How about that? Quote Link to comment Share on other sites More sharing options...
asgsoft Posted May 26, 2007 Author Share Posted May 26, 2007 thanks frost for your input but when I run the script I don't receive any output. Quote Link to comment Share on other sites More sharing options...
per1os Posted May 26, 2007 Share Posted May 26, 2007
<?php
/**
 * Fetch one search-results page and list each matching link on it.
 *
 * The page at $site is scanned for <a ...>...</a> elements. For every anchor
 * that carries an href (mailto links are skipped) the href value is
 * extracted, and each value starting with http://www.thewebmasterstool.com
 * is echoed. The PageRank lookup is disabled in this version, so the rank
 * shown after the URL is empty unless the URL is rejected by isUrlValid().
 * "None found" is printed when the page holds no href anchors at all.
 *
 * @param string $site       Address passed to file_get_contents().
 * @param string $badUrlText Shown after the URL when isUrlValid() exists and
 *                           rejects it. Defaults to an empty string, which
 *                           is what the previously unassigned local
 *                           $BadUrlText evaluated to.
 * @return void
 */
function getGoogleLinks($site, $badUrlText = '')
{
    // Get the web site contents; a failed fetch is treated as an empty page.
    $info = file_get_contents($site);
    if ($info === false) {
        $info = '';
    }

    // Collect every anchor that has an href and is not a mailto link.
    // The former ereg_replace() calls replaced "<" and ">" with themselves
    // and ereg_replace() no longer exists in PHP 7+, so they are gone.
    $anchors = array();
    $spos = 0; // Where to carry on from
    while (true) {
        $open = strpos($info, "<a", $spos);
        if ($open === false) {
            break;
        }
        // Search for the closing tag after the opening one so the slice
        // length can never become negative.
        $close = strpos($info, "</a>", $open);
        if ($close === false) {
            break;
        }

        $anchor = substr($info, $open, ($close - $open) + 4);
        $spos = $close + 4;

        if (strpos($anchor, "mailto") !== false) {
            continue;
        }
        if (strpos($anchor, "href") !== false) {
            $anchors[] = $anchor;
        }
    }

    foreach ($anchors as $anchor) {
        // Find where the href value starts and measure it up to the
        // delimiter that matches how it was opened: the same quote
        // character, or the first space / ">" for an unquoted value.
        if (($start = strpos($anchor, "href=\"")) !== false) {
            $start += 6;
            $length = strcspn($anchor, "\"", $start);
        } elseif (($start = strpos($anchor, "href='")) !== false) {
            $start += 6;
            $length = strcspn($anchor, "'", $start);
        } elseif (($start = strpos($anchor, "href=")) !== false) {
            $start += 5;
            $length = strcspn($anchor, " >", $start);
        } else {
            // "href" appears in the anchor but not as href=...; skip it.
            continue;
        }

        $location = substr($anchor, $start, $length);

        // Only links under this hard-coded prefix are printed.
        if (preg_match("/^http:\/\/www\.thewebmasterstool\.com/", $location)) {
            // The old loop over explode("\n", ...) always stopped after the
            // first line because its $MaxUrls limit was an unset local, so
            // only the first line is used here.
            $lines = explode("\n", $location);
            $location = trim($lines[0]);

            // No rank lookup here, so the rank stays empty. isUrlValid() is
            // not defined in this script, so it is only consulted when some
            // other file has provided it; calling it blindly would be fatal.
            $PageRank = '';
            if (function_exists('isUrlValid') && !isUrlValid($location)) {
                $PageRank = $badUrlText;
            }

            echo($location . "<strong> $PageRank</strong>\n<br />"); // Show it
        }
    }

    if (count($anchors) == 0) {
        echo("\n<br />None found");
    }
    echo("\n\n<br />");
}

// Walk the result pages: num stays at 100 results per page and start is the
// offset of the first result on that page.
for ($i = 0; $i < 10; $i++) {
    $site = "http://www.google.com/search?q=site:http://www.thewebmasterstool.com/&num=100&start=" . ($i * 100);
    getGoogleLinks(strip_tags($site));
}
?>
I just ran this, it worked fine. I think I messed up with the $_GET["site"] part. Quote Link to comment Share on other sites More sharing options...
asgsoft Posted May 26, 2007 Author Share Posted May 26, 2007 OK thanks very much However, do you have any idea why the PR function stopped working? Quote Link to comment Share on other sites More sharing options...
per1os Posted May 26, 2007 Share Posted May 26, 2007 Yea I commented it out because i didn't have the func.php here is the correct code with that pr working.
<?php
// Restored together with the getRank() call below: the rank lookup cannot
// work without it. Presumably func.php supplies PageRankXor32 and
// isUrlValid(), neither of which is defined in this script -- confirm.
include "func.php";

/**
 * Fetch one search-results page and print the PageRank of each matching link.
 *
 * The page at $site is scanned for <a ...>...</a> elements. For every anchor
 * that carries an href (mailto links are skipped) the href value is
 * extracted, and each value starting with http://www.thewebmasterstool.com
 * is echoed followed by its rank. "None found" is printed when the page
 * holds no href anchors at all.
 *
 * @param string $site       Address passed to file_get_contents().
 * @param string $badUrlText Shown instead of a rank when isUrlValid() rejects
 *                           the URL. Defaults to an empty string, which is
 *                           what the previously unassigned local $BadUrlText
 *                           evaluated to.
 * @return void
 */
function getGoogleLinks($site, $badUrlText = '')
{
    // Get the web site contents; a failed fetch is treated as an empty page.
    $info = file_get_contents($site);
    if ($info === false) {
        $info = '';
    }

    // Collect every anchor that has an href and is not a mailto link.
    // The former ereg_replace() calls replaced "<" and ">" with themselves
    // and ereg_replace() no longer exists in PHP 7+, so they are gone.
    $anchors = array();
    $spos = 0; // Where to carry on from
    while (true) {
        $open = strpos($info, "<a", $spos);
        if ($open === false) {
            break;
        }
        // Search for the closing tag after the opening one so the slice
        // length can never become negative.
        $close = strpos($info, "</a>", $open);
        if ($close === false) {
            break;
        }

        $anchor = substr($info, $open, ($close - $open) + 4);
        $spos = $close + 4;

        if (strpos($anchor, "mailto") !== false) {
            continue;
        }
        if (strpos($anchor, "href") !== false) {
            $anchors[] = $anchor;
        }
    }

    foreach ($anchors as $anchor) {
        // Find where the href value starts and measure it up to the
        // delimiter that matches how it was opened: the same quote
        // character, or the first space / ">" for an unquoted value.
        if (($start = strpos($anchor, "href=\"")) !== false) {
            $start += 6;
            $length = strcspn($anchor, "\"", $start);
        } elseif (($start = strpos($anchor, "href='")) !== false) {
            $start += 6;
            $length = strcspn($anchor, "'", $start);
        } elseif (($start = strpos($anchor, "href=")) !== false) {
            $start += 5;
            $length = strcspn($anchor, " >", $start);
        } else {
            // "href" appears in the anchor but not as href=...; skip it.
            continue;
        }

        $location = substr($anchor, $start, $length);

        // Only links under this hard-coded prefix are ranked and printed.
        if (preg_match("/^http:\/\/www\.thewebmasterstool\.com/", $location)) {
            $oPR = new PageRankXor32();

            // The old loop over explode("\n", ...) always stopped after the
            // first line because its $MaxUrls limit was an unset local, so
            // only the first line is used here.
            $lines = explode("\n", $location);
            $location = trim($lines[0]);

            if (isUrlValid($location)) {
                // This call was still commented out, so no rank was ever shown.
                $PageRank = $oPR->getRank($location);
            } else {
                $PageRank = $badUrlText;
            }

            echo($location . "<strong> $PageRank</strong>\n<br />"); // Show it
        }
    }

    if (count($anchors) == 0) {
        echo("\n<br />None found");
    }
    echo("\n\n<br />");
}

// Walk the result pages: num stays at 100 results per page and start is the
// offset of the first result on that page.
for ($i = 0; $i < 10; $i++) {
    $site = "http://www.google.com/search?q=site:http://www.thewebmasterstool.com/&num=100&start=" . ($i * 100);
    getGoogleLinks(strip_tags($site));
}
?>
Quote Link to comment Share on other sites More sharing options...
asgsoft Posted May 27, 2007 Author Share Posted May 27, 2007 OK thank you very much Quote Link to comment Share on other sites More sharing options...
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.