asgsoft Posted May 22, 2007 Share Posted May 22, 2007 Hi I have a script that scans for all the indexed urls for a webpage. However at the moment it only scans the first 100 by going to http://www.google.com/search?q=site:http://www.site.com/&num=100 I know that I can change the pages by changing the value of start= at the end. However how can I make it to show all the indexed pages? any ideas? thanks. The code I am using is below. <?php include "func.php"; $_GET["site"] = "http://www.google.com/search?q=site:http://www.site.com/&num=100"; $site = strip_tags($_GET["site"]); //Get the web site contents $info = file_get_contents($site); //Set vars $stop = "no"; $spos = 0;//Where to carry on from $pos1 = -1;//Front of string to look for $pos2 = -1;//Back of string to look for $data = array(); $links = array(); //the data sorted $counter = -1; $l1 = 0; $l2 = 0; $l3 = 0; $l4 = 0; $count = 0; //Set up while loop while($stop != "yes") { //Set front and back pos $pos1 = strpos($info, "<a", $spos); $pos2 = strpos($info, "</a>", $spos); if(($pos1 > -1) AND ($pos2 > -1)) { //Inc counter $counter++; //Store in array $data[$counter] = substr($info, $pos1, ($pos2-$pos1)+4); //Set where to start of from $spos = $pos2+4; } else { //Stop loop $stop = "yes"; $count = count($data); $counter = 0; while($counter != $count) { $data[$counter] = ereg_replace('<', "<", $data[$counter]); $data[$counter] = ereg_replace('>', ">", $data[$counter]); /* Made a mistake with searching for mailto and hrefs.. got them mixed up so i had to swap them over..! */ if((strpos($data[$counter], "mailto")) > -1)//Mail to link { $links[2][$l2] = $data[$counter]; $l2++; } else if((strpos($data[$counter], "href")) > -1)//Norm link { $links[1][$l1] = $data[$counter]; $l1++; } else if((strpos($data[$counter], "name")) > -1)//Return link { $links[3][$l3] = $data[$counter]; $l3++; } else//Error links { $links[4][$l4] = $data[$counter]; $l4++; } $counter++; } if($links[1] == "")//If the array hasnt been set, set it { $links[1] = 0; } if($links[2] == "") { $links[2] = 0; } if($links[3] == "") { $links[3] = 0; } if($links[4] == "") { $links[4] = 0; } } } $counter = 0; while($counter != $l1) { $location = $links[1][$counter];//the link $pos1 = strpos($location, "href=\"");//Find href=" if($pos1 == "") {//If not $pos1 = strpos($location, "href='");// find href=' } if($pos1 == "") {//If not $pos1 = strpos($location, "href=");//find href= $pos1--;//Minus to make up for the missing " or ' } $pos1 = $pos1+6; $pos2 = strpos($location, "\"", $pos1);//Find " if($pos2 == "") {//if not $pos2 = strpos($location, "'", $pos1);//Find ' } if($pos2 == "") {//if not $tpos1 = strpos($location, " ", $pos1);// find either " " (gap) or > depending which is closer $tpos2 = strpos($location, ">", $pos1); if($tpos1 < $tpos2)//Which is bigger? { $pos2 = strpos($location, " ", $pos1); } else { $pos2 = strpos($location, ">", $pos1);//I used > to allow viewing of the html code } } $location = substr($location, $pos1, ($pos2-$pos1));//At last, get the address if (preg_match("/^http:\/\/www.thewebmasterstool.com/",$location)){ $oPR=new PageRankXor32(); $aUrl=explode("\n",$location); //Step thru array of urls foreach ($aUrl as $location) { //Trim off any whitespace $location=trim($location); //Ignore blank lines if ($location!='') { if (isUrlValid($location)) { $PageRank=$oPR->getRank($location); ++$locationCount; } else { $PageRank=$BadUrlText; } } //Limit the number of urls allowed if ($locationCount >= $MaxUrls) { break; } } echo($location."<strong> $PageRank</strong>\n<br />");//Show it } $links[1][$counter] = $location; $counter++;//Next..! } if($l1 == 0) { echo("\n<br />None found"); } echo("\n\n<br />"); ?> Link to comment https://forums.phpfreaks.com/topic/52531-scanning-all-google-indexed-pages/ Share on other sites More sharing options...
asgsoft Posted May 25, 2007 Author Share Posted May 25, 2007 any ideas? Link to comment https://forums.phpfreaks.com/topic/52531-scanning-all-google-indexed-pages/#findComment-261793 Share on other sites More sharing options...
ToonMariner Posted May 25, 2007 Share Posted May 25, 2007 increas 100 to 100000? Link to comment https://forums.phpfreaks.com/topic/52531-scanning-all-google-indexed-pages/#findComment-261808 Share on other sites More sharing options...
per1os Posted May 25, 2007 Share Posted May 25, 2007 <?php include "func.php"; function getGoogleLinks($site) { //Get the web site contents $info = file_get_contents($site); //Set vars $stop = "no"; $spos = 0;//Where to carry on from $pos1 = -1;//Front of string to look for $pos2 = -1;//Back of string to look for $data = array(); $links = array(); //the data sorted $counter = -1; $l1 = 0; $l2 = 0; $l3 = 0; $l4 = 0; $count = 0; //Set up while loop while($stop != "yes") { //Set front and back pos $pos1 = strpos($info, "<a", $spos); $pos2 = strpos($info, "</a>", $spos); if(($pos1 > -1) AND ($pos2 > -1)) { //Inc counter $counter++; //Store in array $data[$counter] = substr($info, $pos1, ($pos2-$pos1)+4); //Set where to start of from $spos = $pos2+4; } else { //Stop loop $stop = "yes"; $count = count($data); $counter = 0; while($counter != $count) { $data[$counter] = ereg_replace('<', "<", $data[$counter]); $data[$counter] = ereg_replace('>', ">", $data[$counter]); /* Made a mistake with searching for mailto and hrefs.. got them mixed up so i had to swap them over..! */ if((strpos($data[$counter], "mailto")) > -1)//Mail to link { $links[2][$l2] = $data[$counter]; $l2++; } else if((strpos($data[$counter], "href")) > -1)//Norm link { $links[1][$l1] = $data[$counter]; $l1++; } else if((strpos($data[$counter], "name")) > -1)//Return link { $links[3][$l3] = $data[$counter]; $l3++; } else//Error links { $links[4][$l4] = $data[$counter]; $l4++; } $counter++; } if($links[1] == "")//If the array hasnt been set, set it { $links[1] = 0; } if($links[2] == "") { $links[2] = 0; } if($links[3] == "") { $links[3] = 0; } if($links[4] == "") { $links[4] = 0; } } } $counter = 0; while($counter != $l1) { $location = $links[1][$counter];//the link $pos1 = strpos($location, "href=\"");//Find href=" if($pos1 == "") {//If not $pos1 = strpos($location, "href='");// find href=' } if($pos1 == "") {//If not $pos1 = strpos($location, "href=");//find href= $pos1--;//Minus to make up for the missing " or ' } $pos1 = $pos1+6; $pos2 = strpos($location, "\"", $pos1);//Find " if($pos2 == "") {//if not $pos2 = strpos($location, "'", $pos1);//Find ' } if($pos2 == "") {//if not $tpos1 = strpos($location, " ", $pos1);// find either " " (gap) or > depending which is closer $tpos2 = strpos($location, ">", $pos1); if($tpos1 < $tpos2)//Which is bigger? { $pos2 = strpos($location, " ", $pos1); } else { $pos2 = strpos($location, ">", $pos1);//I used > to allow viewing of the html code } } $location = substr($location, $pos1, ($pos2-$pos1));//At last, get the address if (preg_match("/^http:\/\/www.thewebmasterstool.com/",$location)){ $oPR=new PageRankXor32(); $aUrl=explode("\n",$location); //Step thru array of urls foreach ($aUrl as $location) { //Trim off any whitespace $location=trim($location); //Ignore blank lines if ($location!='') { if (isUrlValid($location)) { $PageRank=$oPR->getRank($location); ++$locationCount; } else { $PageRank=$BadUrlText; } } //Limit the number of urls allowed if ($locationCount >= $MaxUrls) { break; } } echo($location."<strong> $PageRank</strong>\n<br />");//Show it } $links[1][$counter] = $location; $counter++;//Next..! } if($l1 == 0) { echo("\n<br />None found"); } echo("\n\n<br />"); } $_GET["site"] = "http://www.google.com/search?q=site:http://www.site.com/&num=100"; $i=1; while ($i < 10) { $site = "http://www.google.com/search?q=site:http://www.site.com/&num=" . ($i*100); getGoogleLinks(strip_tags($_GET["site"])); $i++; } ?> How about that? Link to comment https://forums.phpfreaks.com/topic/52531-scanning-all-google-indexed-pages/#findComment-261810 Share on other sites More sharing options...
asgsoft Posted May 26, 2007 Author Share Posted May 26, 2007 thanks frost for your input but when I run the script I don't receive an output. Link to comment https://forums.phpfreaks.com/topic/52531-scanning-all-google-indexed-pages/#findComment-261979 Share on other sites More sharing options...
per1os Posted May 26, 2007 Share Posted May 26, 2007 <?php function getGoogleLinks($site) { //Get the web site contents $info = file_get_contents($site); //Set vars $stop = "no"; $spos = 0;//Where to carry on from $pos1 = -1;//Front of string to look for $pos2 = -1;//Back of string to look for $data = array(); $links = array(); //the data sorted $counter = -1; $l1 = 0; $l2 = 0; $l3 = 0; $l4 = 0; $count = 0; //Set up while loop while($stop != "yes") { //Set front and back pos $pos1 = strpos($info, "<a", $spos); $pos2 = strpos($info, "</a>", $spos); if(($pos1 > -1) AND ($pos2 > -1)) { //Inc counter $counter++; //Store in array $data[$counter] = substr($info, $pos1, ($pos2-$pos1)+4); //Set where to start of from $spos = $pos2+4; } else { //Stop loop $stop = "yes"; $count = count($data); $counter = 0; while($counter != $count) { $data[$counter] = ereg_replace('<', "<", $data[$counter]); $data[$counter] = ereg_replace('>', ">", $data[$counter]); /* Made a mistake with searching for mailto and hrefs.. got them mixed up so i had to swap them over..! */ if((strpos($data[$counter], "mailto")) > -1)//Mail to link { $links[2][$l2] = $data[$counter]; $l2++; } else if((strpos($data[$counter], "href")) > -1)//Norm link { $links[1][$l1] = $data[$counter]; $l1++; } else if((strpos($data[$counter], "name")) > -1)//Return link { $links[3][$l3] = $data[$counter]; $l3++; } else//Error links { $links[4][$l4] = $data[$counter]; $l4++; } $counter++; } if($links[1] == "")//If the array hasnt been set, set it { $links[1] = 0; } if($links[2] == "") { $links[2] = 0; } if($links[3] == "") { $links[3] = 0; } if($links[4] == "") { $links[4] = 0; } } } $counter = 0; while($counter != $l1) { $location = $links[1][$counter];//the link $pos1 = strpos($location, "href=\"");//Find href=" if($pos1 == "") {//If not $pos1 = strpos($location, "href='");// find href=' } if($pos1 == "") {//If not $pos1 = strpos($location, "href=");//find href= $pos1--;//Minus to make up for the missing " or ' } $pos1 = $pos1+6; $pos2 = strpos($location, "\"", $pos1);//Find " if($pos2 == "") {//if not $pos2 = strpos($location, "'", $pos1);//Find ' } if($pos2 == "") {//if not $tpos1 = strpos($location, " ", $pos1);// find either " " (gap) or > depending which is closer $tpos2 = strpos($location, ">", $pos1); if($tpos1 < $tpos2)//Which is bigger? { $pos2 = strpos($location, " ", $pos1); } else { $pos2 = strpos($location, ">", $pos1);//I used > to allow viewing of the html code } } $location = substr($location, $pos1, ($pos2-$pos1));//At last, get the address if (preg_match("/^http:\/\/www.thewebmasterstool.com/",$location)){ //$oPR=new PageRankXor32(); $aUrl=explode("\n",$location); //Step thru array of urls foreach ($aUrl as $location) { //Trim off any whitespace $location=trim($location); //Ignore blank lines if ($location!='') { if (isUrlValid($location)) { //$PageRank=$oPR->getRank($location); ++$locationCount; } else { $PageRank=$BadUrlText; } } //Limit the number of urls allowed if ($locationCount >= $MaxUrls) { break; } } echo($location."<strong> $PageRank</strong>\n<br />");//Show it } $links[1][$counter] = $location; $counter++;//Next..! } if($l1 == 0) { echo("\n<br />None found"); } echo("\n\n<br />"); } $i=1; while ($i < 10) { $site = "http://www.google.com/search?q=site:http://www.thewebmasterstool.com/&num=" . ($i*100); getGoogleLinks(strip_tags($site)); $i++; } ?> I just ran this, it worked fine. I think I messed up with the $_GET["site"] part. Link to comment https://forums.phpfreaks.com/topic/52531-scanning-all-google-indexed-pages/#findComment-262075 Share on other sites More sharing options...
asgsoft Posted May 26, 2007 Author Share Posted May 26, 2007 OK thanks very much However, do you have any idea why PR function stopped working? Link to comment https://forums.phpfreaks.com/topic/52531-scanning-all-google-indexed-pages/#findComment-262137 Share on other sites More sharing options...
per1os Posted May 26, 2007 Share Posted May 26, 2007 Yea I commented it out because i didn't have the func.php here is the correct code with that pr working. <?php function getGoogleLinks($site) { //Get the web site contents $info = file_get_contents($site); //Set vars $stop = "no"; $spos = 0;//Where to carry on from $pos1 = -1;//Front of string to look for $pos2 = -1;//Back of string to look for $data = array(); $links = array(); //the data sorted $counter = -1; $l1 = 0; $l2 = 0; $l3 = 0; $l4 = 0; $count = 0; //Set up while loop while($stop != "yes") { //Set front and back pos $pos1 = strpos($info, "<a", $spos); $pos2 = strpos($info, "</a>", $spos); if(($pos1 > -1) AND ($pos2 > -1)) { //Inc counter $counter++; //Store in array $data[$counter] = substr($info, $pos1, ($pos2-$pos1)+4); //Set where to start of from $spos = $pos2+4; } else { //Stop loop $stop = "yes"; $count = count($data); $counter = 0; while($counter != $count) { $data[$counter] = ereg_replace('<', "<", $data[$counter]); $data[$counter] = ereg_replace('>', ">", $data[$counter]); /* Made a mistake with searching for mailto and hrefs.. got them mixed up so i had to swap them over..! */ if((strpos($data[$counter], "mailto")) > -1)//Mail to link { $links[2][$l2] = $data[$counter]; $l2++; } else if((strpos($data[$counter], "href")) > -1)//Norm link { $links[1][$l1] = $data[$counter]; $l1++; } else if((strpos($data[$counter], "name")) > -1)//Return link { $links[3][$l3] = $data[$counter]; $l3++; } else//Error links { $links[4][$l4] = $data[$counter]; $l4++; } $counter++; } if($links[1] == "")//If the array hasnt been set, set it { $links[1] = 0; } if($links[2] == "") { $links[2] = 0; } if($links[3] == "") { $links[3] = 0; } if($links[4] == "") { $links[4] = 0; } } } $counter = 0; while($counter != $l1) { $location = $links[1][$counter];//the link $pos1 = strpos($location, "href=\"");//Find href=" if($pos1 == "") {//If not $pos1 = strpos($location, "href='");// find href=' } if($pos1 == "") {//If not $pos1 = strpos($location, "href=");//find href= $pos1--;//Minus to make up for the missing " or ' } $pos1 = $pos1+6; $pos2 = strpos($location, "\"", $pos1);//Find " if($pos2 == "") {//if not $pos2 = strpos($location, "'", $pos1);//Find ' } if($pos2 == "") {//if not $tpos1 = strpos($location, " ", $pos1);// find either " " (gap) or > depending which is closer $tpos2 = strpos($location, ">", $pos1); if($tpos1 < $tpos2)//Which is bigger? { $pos2 = strpos($location, " ", $pos1); } else { $pos2 = strpos($location, ">", $pos1);//I used > to allow viewing of the html code } } $location = substr($location, $pos1, ($pos2-$pos1));//At last, get the address if (preg_match("/^http:\/\/www.thewebmasterstool.com/",$location)){ $oPR=new PageRankXor32(); $aUrl=explode("\n",$location); //Step thru array of urls foreach ($aUrl as $location) { //Trim off any whitespace $location=trim($location); //Ignore blank lines if ($location!='') { if (isUrlValid($location)) { //$PageRank=$oPR->getRank($location); ++$locationCount; } else { $PageRank=$BadUrlText; } } //Limit the number of urls allowed if ($locationCount >= $MaxUrls) { break; } } echo($location."<strong> $PageRank</strong>\n<br />");//Show it } $links[1][$counter] = $location; $counter++;//Next..! } if($l1 == 0) { echo("\n<br />None found"); } echo("\n\n<br />"); } $i=1; while ($i < 10) { $site = "http://www.google.com/search?q=site:http://www.thewebmasterstool.com/&num=" . ($i*100); getGoogleLinks(strip_tags($site)); $i++; } ?> Link to comment https://forums.phpfreaks.com/topic/52531-scanning-all-google-indexed-pages/#findComment-262260 Share on other sites More sharing options...
asgsoft Posted May 27, 2007 Author Share Posted May 27, 2007 OK thank you very much Link to comment https://forums.phpfreaks.com/topic/52531-scanning-all-google-indexed-pages/#findComment-262552 Share on other sites More sharing options...
Recommended Posts
Archived
This topic is now archived and is closed to further replies.