Jump to content

preg_match basics and filtering


dflow

Recommended Posts

i want to parse all the urls from a page using preg_match

 

i get the results

i want to isolate so i have a distinct result

so if there is more than one link to the same url

I only want to display one result in the array

 

i want to find  all urls with this pattern:

"products/productpage1.htm"

 

im using this code


<?php

// Download the page whose links should be listed.
$data = file_get_contents("http://example.com");
if ($data === false) {
    exit("Could not download the page\n");
}

// Capture the href value only up to the first quote, whitespace or '>'.
// The previous pattern used a greedy ".*", which ran from the first href to
// the LAST "htm" on the line and so merged several links into one match.
$pattern = "/href=[\"']?([^\"'\\s>]+\\.html?)/i";
preg_match_all($pattern, $data, $matches);

// $matches[1] holds just the captured URLs. array_unique() keeps one entry per
// distinct URL and array_values() renumbers the result from 0.
$urls = array_values(array_unique($matches[1]));

print_r($urls);
?>

 

Link to comment
https://forums.phpfreaks.com/topic/221333-preg_match-basics-and-filtering/
Share on other sites

Sorry if it is a bit shabby; I'm not the best at regex. Feel free to change it; in fact, I'd recommend doing so if you can.

<?php
// Prints the value of every double-quoted href attribute found on $site,
// one per line.
$site = 'siteurl';

// No "@" here: if the download fails, PHP's own warning says why instead of
// the script silently printing nothing.
$sitecont = file_get_contents($site);
if ($sitecont !== false) {
    // Capture only the text between the quotes. [^"]+ stops at the closing
    // quote, whereas [\S]{1,50} could run past it (a quote is non-whitespace)
    // and dropped any URL longer than 50 characters.
    preg_match_all('/href="([^"]+)"/', $sitecont, $matches);

    // $matches[1] already holds the bare URLs, so no substr() offsets needed.
    foreach ($matches[1] as $link) {
        // The text comes from a remote page; escape it before echoing as HTML.
        echo htmlspecialchars($link) . '<br />';
    }
}
?>

<?php
// Prints every double-quoted href found on $site, with this server's own
// scheme-and-host prefix removed so local links are shown as paths.
$site = 'http://localhost';

// HTTP_HOST is not set when the script is run from the command line.
$domain = isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '';

$sitecont = file_get_contents($site);
if ($sitecont !== false) {
    // Capture only the text between the quotes.
    preg_match_all('/href="([^"]*)"/', $sitecont, $matches);

    // Matches "http://host", "https://host" or "//host" at the start of a URL.
    // preg_quote() is required: the host contains dots (regex wildcards) and
    // may contain other metacharacters. Removing the scheme together with the
    // host avoids leaving "http:///path" behind.
    $ownPrefix = '#^(?:https?:)?//' . preg_quote($domain, '#') . '(?=/|$)#i';

    $links = array();
    foreach ($matches[1] as $href) {
        if ($domain !== '') {
            $href = preg_replace($ownPrefix, '', $href);
        }
        $links[] = $href;
    }

    foreach ($links as $link) {
        // The text comes from a remote page; escape it before echoing as HTML.
        echo htmlspecialchars($link) . '<br />';
    }
}
?>

 

Added some more functionality.

<?php
// Prints each distinct double-quoted href found on $site exactly once, with
// this server's own scheme-and-host prefix removed.
$site = 'http://localhost';

// HTTP_HOST is not set when the script is run from the command line.
$domain = isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '';

$sitecont = file_get_contents($site);
if ($sitecont !== false) {
    // Capture only the text between the quotes.
    preg_match_all('/href="([^"]*)"/', $sitecont, $matches);

    // Matches "http://host", "https://host" or "//host" at the start of a URL.
    // preg_quote() keeps the dots in the host from acting as wildcards.
    // Removing the scheme along with the host means "http://host/a.htm" and
    // "/a.htm" end up as the same string and are recognised as duplicates;
    // stripping only the host left "http:///a.htm", which never matched.
    $ownPrefix = '#^(?:https?:)?//' . preg_quote($domain, '#') . '(?=/|$)#i';

    $links = array();
    foreach ($matches[1] as $nlink) {
        if ($domain !== '') {
            $nlink = preg_replace($ownPrefix, '', $nlink);
        }
        // Strict comparison: with "==" numeric-looking strings such as
        // "1e3" and "1000" would wrongly count as the same link.
        if (!in_array($nlink, $links, true)) {
            $links[] = $nlink;
        }
    }

    foreach ($links as $link) {
        // The text comes from a remote page; escape it before echoing as HTML.
        echo htmlspecialchars($link) . '<br />';
    }
}
?>

 

Now it should only echo/store a link once. (only unique links)

<?php
// Prints each distinct double-quoted href found on $site exactly once, with
// this server's own scheme-and-host prefix removed.
$site = 'http://localhost';

// HTTP_HOST is not set when the script is run from the command line.
$domain = isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '';

$sitecont = file_get_contents($site);
if ($sitecont !== false) {
    // Capture only the text between the quotes.
    preg_match_all('/href="([^"]*)"/', $sitecont, $matches);

    // Matches "http://host", "https://host" or "//host" at the start of a URL.
    // preg_quote() keeps the dots in the host from acting as wildcards.
    // Removing the scheme along with the host means "http://host/a.htm" and
    // "/a.htm" end up as the same string and are recognised as duplicates;
    // stripping only the host left "http:///a.htm", which never matched.
    $ownPrefix = '#^(?:https?:)?//' . preg_quote($domain, '#') . '(?=/|$)#i';

    $links = array();
    foreach ($matches[1] as $nlink) {
        if ($domain !== '') {
            $nlink = preg_replace($ownPrefix, '', $nlink);
        }
        // Strict comparison: with "==" numeric-looking strings such as
        // "1e3" and "1000" would wrongly count as the same link.
        if (!in_array($nlink, $links, true)) {
            $links[] = $nlink;
        }
    }

    foreach ($links as $link) {
        // The text comes from a remote page; escape it before echoing as HTML.
        echo htmlspecialchars($link) . '<br />';
    }
}
?>

 

Now it should only echo/store a link once. (only unique links)

 

for some reason it still outputs 3 links

 

Now, how would I loop through each result and parse each web page?

foreach ($links as $link) {
    // $link is only a URL string, so it has no find() method: the page it
    // points to has to be downloaded and parsed before it can be searched.

    // Links without a scheme are relative and must be prefixed with the site
    // address. NOTE(review): assumes $site (the page the links were collected
    // from) is still in scope here - confirm against the enclosing script.
    if (preg_match('#^https?://#i', $link)) {
        $pageUrl = $link;
    } else {
        $pageUrl = rtrim($site, '/') . '/' . ltrim($link, '/');
    }

    $pageHtml = file_get_contents($pageUrl);
    if ($pageHtml === false || $pageHtml === '') {
        // Page could not be fetched (or is empty); move on to the next link.
        continue;
    }

    // Real-world HTML is rarely valid, so collect libxml's parse warnings
    // internally instead of printing them, then restore the previous setting.
    $previous = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML($pageHtml);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    // XPath equivalent of the selector span[id=apartmentname].
    $xpath = new DOMXPath($dom);
    foreach ($xpath->query('//span[@id="apartmentname"]') as $partmentname) {
        // textContent is the element's text with all tags removed.
        echo htmlspecialchars($partmentname->textContent) . '<br><br>';
    }
}
}

The foreach isn't correct. How should it be written to loop through the resulting array?

I know I'm making a mess, but I'm new to looping over arrays.

thanks

Archived

This topic is now archived and is closed to further replies.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.