d.shankar Posted August 8, 2007 Share Posted August 8, 2007 Hello all ! I have written a code such that given a webaddress it will find all the links in that page.. The problem is the code is unable to find the nested links. Example: I provide the site www.google.com my code retrieves these files.. index.html about.html My query to you all is that , the code is unable to find the nested links under index.html and about.html here is my code <?php $domain="http://www.google.com"; $links = getLinks($domain,$domain); if(!empty($links->url)) { $arr_urls1=implode(",",$links->url); $durls1=explode(",",$arr_urls1); $xurlsx=array_unique($durls1); $aurls=array(); $i=0; // copying non duplicate entries to a new array to avoid offset error foreach($xurlsx as $strItem){ $aurls[$i]=$strItem; $i++; } // Number of url links //echo "number of url links = " . count( $aurls ) . "<br><br>"; // this loop is used to find the sub links from the found pages. but i cant achieve for($i=0;$i<count($aurls);$i++) { $aurls_len=count($aurls); //echo $aurls[$i]; //echo "<br>"; flush(); //echo "sending...".$aurls[$i]."<br>"; getLinks($aurls[$i],$domain); if(!empty($links->url)) { $arr_urls=implode(",",$links->url); $xurls=explode(",",$arr_urls);//to array //$xurls2=array_unique($temp_urls); /* FYI echo "<hr>"; flush(); for($j=0;$j<count($xurls);$j++) { echo $xurls[$j]; echo "<br>"; flush(); } echo "<hr>"; flush(); */ for($j=0;$j<count($xurls);$j++) { if(in_array($xurls[$j],$aurls)) { //echo "found"; //flush(); } else { //echo "not found"; //flush(); $aurls[$aurls_len]=$xurls[$j]; $aurls_len++; } } unset($xurls); unset($arr_urls); } } echo "<hr>"; for($i=0;$i<count($aurls);$i++) { echo $aurls[$i]; echo "<br>"; } echo "<hr>"; } /* for($i=0;$i<count($xurls);$i++) { echo $xurls[$i]; echo "<br>"; } */ //print_r(array_values($aurls)); //echo implode("<br>",$links0->javascript); //curl_close($ch); function getLinks($url,$host) { $ch = curl_init(); curl_setopt($ch,CURLOPT_URL, $url); curl_setopt($ch,CURLOPT_RETURNTRANSFER,1); curl_setopt($ch,CURLOPT_FOLLOWLOCATION, 1); curl_setopt($ch,CURLOPT_FAILONERROR,true); $source = curl_exec($ch); $httpCode = curl_getinfo($ch,CURLINFO_HTTP_CODE); echo "<br>Response:".$httpCode." For ".$url."<br>"; curl_close($ch); preg_match_all("/<a (?:.*?)href=\"([^\"]+?)\"(?:[^>]*?)>/si", $source, $result ); //$links->email = array(); //$links->javascript = array(); $links->url = array(); if($httpCode==0 || $httpCode==404 || $httpCode==500 || $httpCode==400) { $links->url[]=""; //break; } else { foreach( $result[1] as $value ) { if( strtolower( substr( $value, 0, 7 ) ) == "mailto:" ) { // $links->email[] = substr( $value, 7 ); } else if( strtolower( substr( $value, 0, 11 ) ) == "javascript:" ) { // $links->javascript[] = substr( $value, 11 ); } else { //index.html,www.xyz.com if(stripos($value,$host)===true) { //$value=$host."/".$value; $links->url[] = $value; } else { if(stripos($value,"http://")===false) { if(substr($value,0,1)=="/") { //substr_replace($value," ",0); $value=$host.$value; $links->url[] = $value; } else { $value=$host."/".$value; $links->url[] = $value; } } } }//else //else }//for }//else return $links; //flush(); }//func ?> Please check the code .. It does not have any errors but it fails to find further links. Link to comment https://forums.phpfreaks.com/topic/63932-php-coding-help/ Share on other sites More sharing options...
GingerRobot Posted August 8, 2007 Share Posted August 8, 2007 Im a little confused as to what you are trying to do. If i run this code, i get: Response:200 For http://www.google.com Response:200 For http://www.google.com/intl/en/options/ Response:200 For http://www.google.com/intl/en/ads/ Response:200 For http://www.google.com/services/ Response:200 For http://www.google.com/intl/en/about.html -------------------------------------------------------------------------------- http://www.google.com/intl/en/options/ http://www.google.com/intl/en/ads/ http://www.google.com/services/ http://www.google.com/intl/en/about.html Do you mean to say you want to retrieve all of the links from http://www.google.com/intl/en/options/ , http://www.google.com/intl/en/ads/, etc (and presumably any links found on each of those pages?) ? Link to comment https://forums.phpfreaks.com/topic/63932-php-coding-help/#findComment-318660 Share on other sites More sharing options...
Recommended Posts
Archived
This topic is now archived and is closed to further replies.