d.shankar Posted August 8, 2007 Share Posted August 8, 2007 Hello all ! I have written a code such that given a webaddress it will find all the links in that page.. The problem is the code is unable to find the nested links. Example: I provide the site www.google.com my code retrieves these files.. index.html about.html My query to you all is that , the code is unable to find the nested links under index.html and about.html here is my code <?php $domain="http://www.google.com"; $links = getLinks($domain,$domain); if(!empty($links->url)) { $arr_urls1=implode(",",$links->url); $durls1=explode(",",$arr_urls1); $xurlsx=array_unique($durls1); $aurls=array(); $i=0; // copying non duplicate entries to a new array to avoid offset error foreach($xurlsx as $strItem){ $aurls[$i]=$strItem; $i++; } // Number of url links //echo "number of url links = " . count( $aurls ) . "<br><br>"; // this loop is used to find the sub links from the found pages. but i cant achieve for($i=0;$i<count($aurls);$i++) { $aurls_len=count($aurls); //echo $aurls[$i]; //echo "<br>"; flush(); //echo "sending...".$aurls[$i]."<br>"; getLinks($aurls[$i],$domain); if(!empty($links->url)) { $arr_urls=implode(",",$links->url); $xurls=explode(",",$arr_urls);//to array //$xurls2=array_unique($temp_urls); /* FYI echo "<hr>"; flush(); for($j=0;$j<count($xurls);$j++) { echo $xurls[$j]; echo "<br>"; flush(); } echo "<hr>"; flush(); */ for($j=0;$j<count($xurls);$j++) { if(in_array($xurls[$j],$aurls)) { //echo "found"; //flush(); } else { //echo "not found"; //flush(); $aurls[$aurls_len]=$xurls[$j]; $aurls_len++; } } unset($xurls); unset($arr_urls); } } echo "<hr>"; for($i=0;$i<count($aurls);$i++) { echo $aurls[$i]; echo "<br>"; } echo "<hr>"; } /* for($i=0;$i<count($xurls);$i++) { echo $xurls[$i]; echo "<br>"; } */ //print_r(array_values($aurls)); //echo implode("<br>",$links0->javascript); //curl_close($ch); function getLinks($url,$host) { $ch = curl_init(); curl_setopt($ch,CURLOPT_URL, $url); curl_setopt($ch,CURLOPT_RETURNTRANSFER,1); curl_setopt($ch,CURLOPT_FOLLOWLOCATION, 1); curl_setopt($ch,CURLOPT_FAILONERROR,true); $source = curl_exec($ch); $httpCode = curl_getinfo($ch,CURLINFO_HTTP_CODE); echo "<br>Response:".$httpCode." For ".$url."<br>"; curl_close($ch); preg_match_all("/<a (?:.*?)href=\"([^\"]+?)\"(?:[^>]*?)>/si", $source, $result ); //$links->email = array(); //$links->javascript = array(); $links->url = array(); if($httpCode==0 || $httpCode==404 || $httpCode==500 || $httpCode==400) { $links->url[]=""; //break; } else { foreach( $result[1] as $value ) { if( strtolower( substr( $value, 0, 7 ) ) == "mailto:" ) { // $links->email[] = substr( $value, 7 ); } else if( strtolower( substr( $value, 0, 11 ) ) == "javascript:" ) { // $links->javascript[] = substr( $value, 11 ); } else { //index.html,www.xyz.com if(stripos($value,$host)===true) { //$value=$host."/".$value; $links->url[] = $value; } else { if(stripos($value,"http://")===false) { if(substr($value,0,1)=="/") { //substr_replace($value," ",0); $value=$host.$value; $links->url[] = $value; } else { $value=$host."/".$value; $links->url[] = $value; } } } }//else //else }//for }//else return $links; //flush(); }//func ?> Please check the code .. It does not have any errors but it fails to find further links. Quote Link to comment https://forums.phpfreaks.com/topic/63932-php-coding-help/ Share on other sites More sharing options...
GingerRobot Posted August 8, 2007 Share Posted August 8, 2007 Im a little confused as to what you are trying to do. If i run this code, i get: Response:200 For http://www.google.com Response:200 For http://www.google.com/intl/en/options/ Response:200 For http://www.google.com/intl/en/ads/ Response:200 For http://www.google.com/services/ Response:200 For http://www.google.com/intl/en/about.html -------------------------------------------------------------------------------- http://www.google.com/intl/en/options/ http://www.google.com/intl/en/ads/ http://www.google.com/services/ http://www.google.com/intl/en/about.html Do you mean to say you want to retrieve all of the links from http://www.google.com/intl/en/options/ , http://www.google.com/intl/en/ads/, etc (and presumably any links found on each of those pages?) ? Quote Link to comment https://forums.phpfreaks.com/topic/63932-php-coding-help/#findComment-318660 Share on other sites More sharing options...
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.