Jump to content

Relative to Exact Links


The Little Guy

Recommended Posts

I am trying to convert a link such as:

 

../somedir/somefile.html

 

to a URI like:

 

http://somesite.com/root/somedir/somefile.html

 

 

My code browses a page and gets all the links. As we all know, links are not always formatted the way shown above: they can be absolute or relative as well. I need to turn the links gathered from each page into EXACT links, such as: http://somesite.com/root/somedir/somefile.html

 

I am not 100% sure of where to begin on this...  I think I can get the absolute links, that seems easy, but the relative links seem a little harder to do.

 

Any Ideas of how I could do this?

Link to comment
https://forums.phpfreaks.com/topic/138744-relative-to-exact-links/
Share on other sites

So far it looks like this:

 

<?php
/*
 * Work-in-progress link gatherer: downloads one page with cURL and prints
 * each href="..." value found in it, one per line.
 *
 * Absolute links are printed unchanged and plain relative links are joined
 * onto the effective (post-redirect) URL. Parent-relative links ("../...")
 * are NOT resolved yet; for those only the marker 'here' is printed.
 */
include '/home/ryannaddy/dudeel.com/incl/includes.php';
// create a new cURL resource
$ch = curl_init();

$url = 'http://google.com/';

// set URL and other appropriate options
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, FALSE);
curl_setopt($ch, CURLOPT_AUTOREFERER, TRUE);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);


// fetch the page into a string (RETURNTRANSFER stops cURL from echoing it)
$opt = curl_exec($ch);
if($opt === false){
   // curl_exec() returns false on failure; the regex calls below need a string
   error_log('cURL request for '.$url.' failed: '.curl_error($ch));
   curl_close($ch);
   exit(1);
}
// the URL actually served after redirects; relative links are relative to this
$reURL = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);

// remove <style>/<script> blocks (\1 = whichever tag was opened),
// then flatten line breaks and collapse runs of whitespace
$opt = preg_replace('~<(style|script)[^>]*>.*?</\1>~s',' ',$opt);
$opt = preg_replace("~(\n|\r| )~s",' ',$opt);
$opt = preg_replace("/\s\s+/",' ',$opt);

if(preg_match_all('~href="(.+?)"~',$opt,$matches)){
   foreach($matches[1] as $match){
      if(preg_match("~^http~",$match)){
         // already absolute
         echo $match."\n";
      }elseif(preg_match('~^\.\./~',$match)){
         // TODO: parent-relative link, still needs to be resolved against $reURL
         echo 'here'."\n";
      }else{
         // Join on exactly one slash. Trimming only the two ends being joined
         // leaves the "//" after the scheme intact.
         // NOTE: this is only correct while $reURL points at a directory.
         echo rtrim($reURL,'/').'/'.ltrim($match,'/')."\n";
      }
      //mysql_query(sprintf());
   }
}


//echo strip_tags($opt);


// close cURL resource, and free up system resources
curl_close($ch);
//echo "\n";
?>

 

I should mention that this is going to run as a cron, and not in a browser.

how about:

<?php
/*
 * Downloads one page with cURL and prints every href="..." value in it as an
 * HTML link. javascript: links are skipped, http/ftp links are kept as they
 * are, and everything else is appended directly to the effective URL.
 *
 * NOTE: the plain concatenation is only correct when the effective URL ends
 * in a directory slash and the link contains no "../" segments.
 */
include '/home/ryannaddy/dudeel.com/incl/includes.php';
// create a new cURL resource
$ch = curl_init();

$url = 'http://www.cnn.com';

// set URL and other appropriate options
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, FALSE);
curl_setopt($ch, CURLOPT_AUTOREFERER, TRUE);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);


// fetch the page into a string (RETURNTRANSFER stops cURL from echoing it)
$opt = curl_exec($ch);
if($opt === false){
   // curl_exec() returns false on failure; the regex calls below need a string
   error_log('cURL request for '.$url.' failed: '.curl_error($ch));
   curl_close($ch);
   exit(1);
}
// the URL actually served after redirects
$reURL = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);

// remove <style>/<script> blocks (\1 = whichever tag was opened),
// then flatten line breaks and collapse runs of whitespace
$opt = preg_replace('~<(style|script)[^>]*>.*?</\1>~s',' ',$opt);
$opt = preg_replace("~(\n|\r| )~s",' ',$opt);
$opt = preg_replace("/\s\s+/",' ',$opt);

if(preg_match_all('~href="(.+?)"~',$opt,$matches)){
   foreach($matches[1] as $match){
      if(preg_match("~^javascript~",$match))
        continue; //Skip JavaScript
      if(preg_match("~^(http|ftp)~",$match)){
        $href = $match;
      }else{
         $href = $reURL.$match;
      }
      // The href comes from a remote page, so escape it before writing HTML.
      // double_encode=false keeps entities already present (e.g. &amp;) intact.
      $safe = htmlspecialchars($href, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
      echo "<a href=\"$safe\">$safe</a><br>";
      //mysql_query(sprintf());
   }
}


//echo strip_tags($opt);


// close cURL resource, and free up system resources
curl_close($ch);
//echo "\n";
?>

I noticed your page had some ../ in it, so I tested it, and got this value:

 

http://vectorloft.com/main/index.html../stylesheet.css
http://vectorloft.com/main/index.html../index.html
http://vectorloft.com/main/index.htmlthecrew.html
http://vectorloft.com/main/index.htmlwhoweare.html
http://vectorloft.com/main/index.html../music
http://vectorloft.com/main/index.htmlportfolio.html
http://vectorloft.com/main/index.htmlcontactus.html
http://vectorloft.com/main/index.html../blog

For that you need to use some more code:

 

<?php
/*
 * Downloads one page with cURL and prints every href="..." value in it as an
 * HTML link, made absolute against the directory of the effective URL.
 *
 * javascript: links are skipped and http/ftp links are kept as they are.
 * Root-relative ("/x") and protocol-relative ("//host/x") links are resolved
 * against the site root / scheme. Other links are appended to the page's
 * directory; any "../" segments are left in the resulting URL unchanged.
 */
include '/home/ryannaddy/dudeel.com/incl/includes.php';
// create a new cURL resource
$ch = curl_init();

$url = 'http://vectorloft.com/main/index.html';

// set URL and other appropriate options
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, FALSE);
curl_setopt($ch, CURLOPT_AUTOREFERER, TRUE);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);


// fetch the page into a string (RETURNTRANSFER stops cURL from echoing it)
$opt = curl_exec($ch);
if($opt === false){
   // curl_exec() returns false on failure; the regex call below needs a string
   error_log('cURL request for '.$url.' failed: '.curl_error($ch));
   curl_close($ch);
   exit(1);
}

// split the effective (post-redirect) URL into its components
$parts = parse_url(curl_getinfo($ch, CURLINFO_EFFECTIVE_URL));
if(!is_array($parts) || !isset($parts['scheme'], $parts['host'])){
   error_log('Could not parse the effective URL for '.$url);
   curl_close($ch);
   exit(1);
}
// scheme://host[:port] - what root-relative links hang off
$root = $parts['scheme'].'://'.$parts['host'].(isset($parts['port']) ? ':'.$parts['port'] : '');
// parse_url() omits 'path' for URLs such as http://example.com
$path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
// Directory of the page = everything up to and including the last slash.
// Unlike dirname(), this keeps "/main/" as "/main/" and "/" as "/".
$parts['path'] = substr($path, 0, strrpos($path, '/') + 1);
$reURL = $root.$parts['path'];

// whitespace/script stripping is disabled in this version
//$opt = preg_replace('~<(style|script)[^>]*>.*?</\1>~s',' ',$opt);
//$opt = preg_replace("~(\n|\r| )~s",' ',$opt);
//$opt = preg_replace("/\s\s+/",' ',$opt);

if(preg_match_all('~href="(.+?)"~',$opt,$matches)){
   foreach($matches[1] as $match){
      if(preg_match("~^javascript~",$match))
        continue; //Skip JavaScript
      if(preg_match("~^(http|ftp)~",$match)){
         $href = $match;
      }elseif(substr($match, 0, 2) === '//'){
         // protocol-relative: reuse the page's scheme
         $href = $parts['scheme'].':'.$match;
      }elseif($match[0] === '/'){
         // root-relative: ignore the page's directory
         $href = $root.$match;
      }else{
         // $reURL already ends in a slash
         $href = $reURL.$match;
      }
      // The href comes from a remote page, so escape it before writing HTML.
      // double_encode=false keeps entities already present (e.g. &amp;) intact.
      $safe = htmlspecialchars($href, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
      echo "<a href=\"$safe\">$safe</a><br>";
      //mysql_query(sprintf());
   }
}


//echo strip_tags($opt);


// close cURL resource, and free up system resources
curl_close($ch);
//echo "\n";
?>

Alright, thanks, that works! Do you know whether it will also work if there is more than one level, like this: ../../ ?

 

This is the code:

<?php
/*
 * Downloads one page with cURL and prints every link found in it as an
 * absolute URL, one per line.
 *
 * Resolution rules, relative to the effective (post-redirect) URL:
 *   - http(s)/ftp links are printed unchanged;
 *   - links with any other scheme (javascript:, mailto:, ...) are skipped,
 *     as are empty hrefs and same-page "#anchor" links;
 *   - "//host/path" reuses the page's scheme;
 *   - "/path" is resolved against the site root;
 *   - "?query" is attached to the page's own path;
 *   - anything else is resolved against the page's directory.
 * "." and ".." path segments are collapsed, so "../../x" climbs two levels
 * (and never above the site root).
 */
include '/home/ryannaddy/dudeel.com/incl/includes.php';
// create a new cURL resource
$ch = curl_init();

$url = 'http://vectorloft.com/main/index.html';
//$url = 'http://dudeel.com/add?a=site';
// set URL and other appropriate options
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, FALSE);
curl_setopt($ch, CURLOPT_AUTOREFERER, TRUE);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);

// fetch the page into a string (RETURNTRANSFER stops cURL from echoing it)
$opt = curl_exec($ch);
if($opt === false){
	// curl_exec() returns false on failure; the regex calls below need a string
	error_log('cURL request for '.$url.' failed: '.curl_error($ch));
	curl_close($ch);
	exit(1);
}

// split the effective (post-redirect) URL into its components
$parts = parse_url(curl_getinfo($ch, CURLINFO_EFFECTIVE_URL));
if(!is_array($parts) || !isset($parts['scheme'], $parts['host'])){
	error_log('Could not parse the effective URL for '.$url);
	curl_close($ch);
	exit(1);
}
// scheme://host[:port] - what root-relative links hang off
$root = $parts['scheme'].'://'.$parts['host'].(isset($parts['port']) ? ':'.$parts['port'] : '');
// parse_url() omits 'path' for URLs such as http://example.com
$pagePath = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
// Directory of the page = everything up to and including the last slash.
// Unlike dirname(), this keeps "/main/" as "/main/" and "/" as "/".
$parts['path'] = substr($pagePath, 0, strrpos($pagePath, '/') + 1);
$reURL = $root.$parts['path'];
//echo $reURL;

// remove <style>/<script> blocks (\1 = whichever tag was opened) and <link>
// tags, then flatten line breaks and collapse runs of whitespace
$opt = preg_replace('~<(style|script)[^>]*>.*?</\1>~is',' ',$opt);
$opt = preg_replace('~<link[^>]*>~is',' ',$opt);
$opt = preg_replace("~(\n|\r| )~s",' ',$opt);
$opt = preg_replace("/\s\s+/",' ',$opt);

// group 1 = the opening quote (single or double), group 2 = the link itself;
// .*? lets an empty href match as empty instead of running into the next attribute
if(preg_match_all('~href\s*=\s*(["\'])(.*?)\1~i',$opt,$matches)){
foreach($matches[2] as $match){
	// attribute values are entity-encoded in HTML (&amp; etc.)
	$match = trim(html_entity_decode($match, ENT_QUOTES));
	if($match === '' || $match[0] === '#')
		continue; // nothing new to fetch: empty href or same-page anchor

	if(preg_match('~^[a-z][a-z0-9+.-]*:~i',$match)){
		if(!preg_match('~^(https?|ftp)://~i',$match))
			continue; // javascript:, mailto:, tel:, ... are not pages
		$href = $match;
	}elseif(substr($match, 0, 2) === '//'){
		// protocol-relative: reuse the page's scheme
		$href = $parts['scheme'].':'.$match;
	}else{
		if($match[0] === '/'){
			$target = $match;               // root-relative
		}elseif($match[0] === '?'){
			$target = $pagePath.$match;     // same page, different query
		}else{
			$target = $parts['path'].$match; // relative to the page's directory
		}

		// keep the query/fragment out of the way while collapsing dot segments
		$cut = strcspn($target, '?#');
		$suffix = (string) substr($target, $cut);
		$segments = array();
		$last = '';
		foreach(explode('/', (string) substr($target, 0, $cut)) as $last){
			if($last === '..'){
				// go up one level, but never pop the empty root segment
				if(count($segments) > 1)
					array_pop($segments);
			}elseif($last !== '.'){
				$segments[] = $last;
			}
		}
		// a path ending in "." or ".." names a directory: keep the trailing slash
		if($last === '.' || $last === '..')
			$segments[] = '';

		$href = $root.implode('/', $segments).$suffix;
	}
	echo "$href\n";
	//mysql_query(sprintf());
}
}

// close cURL resource, and free up system resources
curl_close($ch);
//echo "\n";
?>

Archived

This topic is now archived and is closed to further replies.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.