Jump to content

[SOLVED] Delete lines from htm file until body tag is found


rossh

Recommended Posts

Hi,

 

I've been working on a sanitize script to clean html files.  I'm trying to figure out how to delete lines from the top of the file until the <body> tag is found.  Any help would be appreciated.

 

R

 

<?php

// Directory that holds the files to clean; must end with a directory separator
// because file names are appended to it directly.
$base_dir = 'c:\\xampp\\htdocs\\sanitize\\';

//CLEAN DATA HERE
// Map of search string => replacement, applied in this order by sanitizeFile().
// The ampersand and pound entries previously mapped each string to itself (a no-op)
// and the pound key was listed twice; they now produce the HTML entities.
// NOTE(review): the entity targets are inferred from the script's purpose - confirm.
$sanitize = array(
	' & ' => ' &amp; ',
	'£' => '&pound;',
	'<b>' => '<strong>',
	'</b>' => '</strong>',
	'<i>' => '<em>',
	'</i>' => '</em>'
);

/**
 * List the files that sit directly inside the base directory.
 *
 * Reads the global $base_dir and skips the "." and ".." entries, sub-directories
 * and the script itself (sanitize.php).
 *
 * @return string Comma-separated file names, or an empty string when nothing qualifies.
 */
function outputDir(){
	global $base_dir;

	$dir = opendir($base_dir) or die('Couldn\'t open directory, please contact the web administrator');

	$filenames = array();
	while(($file = readdir($dir)) !== false){
		if($file === '.' || $file === '..' || $file === 'sanitize.php'){
			continue;
		}
		// Test the full path: a bare name would be resolved against the current
		// working directory, which is not necessarily $base_dir.
		if(is_dir($base_dir.$file)){
			continue;
		}
		$filenames[] = $file;
	}
	closedir($dir);

	// implode() on an empty list yields '', so no trailing-comma trimming is needed
	return implode(',', $filenames);
}
/**
 * Apply the global $sanitize replacement map to one file and write the result,
 * under the same name, into the "sanitized" sub-directory of the base directory.
 *
 * Echoes a message when the source is empty, cannot be read, or the result
 * cannot be written. An empty source still produces an (empty) output file.
 *
 * @param string $file File name relative to the global $base_dir.
 * @return void
 */
function sanitizeFile($file){
	global $base_dir, $sanitize;

	$lines = file($base_dir.$file);
	if($lines === false){
		// Nothing to transform; do not create a misleading empty output file
		echo 'File: '.$file.' could not be read!';
		return;
	}
	if(empty($lines)){
		echo 'File: '.$file.' empty!';
	}

	// str_replace() with arrays applies the pairs in map order, like a manual loop
	$search = array_keys($sanitize);
	$replace = array_values($sanitize);

	$new_lines = array();
	foreach($lines as $line){
		$new_lines[] = str_replace($search, $replace, $line);
	}

	// DIRECTORY_SEPARATOR keeps the Windows path identical and also works elsewhere
	$target = $base_dir.'sanitized'.DIRECTORY_SEPARATOR.$file;
	if(file_put_contents($target, implode('', $new_lines)) === false){
		echo 'File: '.$file.' could not be written!';
	}
}
//Get filenames and sanitize
// $response is always defined so the template can echo it even when the form
// has not been submitted.
$response = '';
if(isset($_POST['submit'])){
	$listing = (string) outputDir();
	// explode() on an empty string would yield array(''), i.e. one bogus empty file name
	$filenames = ($listing === '') ? array() : explode(',', $listing);

	$response .= '<ul>'."\n";
	foreach($filenames as $file){
		sanitizeFile($file);
		// File names end up in HTML output, so escape them
		$response .= '<li>Sanitized: '.htmlspecialchars($file).'</li>'."\n";
	}
	$response .= '</ul>'."\n";
}
?>

<?php /* Page template: shows the base directory, the submit form and the result list.
         Dynamic values are HTML-escaped; PHP_SELF in particular is request-controlled. */ ?>
<p>Base Directory: <?php echo htmlspecialchars($base_dir); ?></p>

<form name="sanitize_data" action="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES); ?>" method="post">
<input type="submit" name="submit" value="Sanitize!" id="submit" />
</form>

<?php /* $response already contains markup and is only set after a submit */ ?>
<?php echo isset($response) ? $response : ''; ?>

Here is one way you could do it:

 

<?php
// Placeholder input: load the HTML page into this string first,
// e.g. with file_get_contents() or fopen()/fread().
$unsanitesedHTMLstring = isset($unsanitesedHTMLstring) ? (string) $unsanitesedHTMLstring : '';

// Find the first opening body tag, case-insensitively and with or without
// attributes (such as onload), and keep everything from that tag onwards.
// When no body tag exists the input is passed through unchanged.
if(preg_match('/<body\b[^>]*>/i', $unsanitesedHTMLstring, $bodyMatch, PREG_OFFSET_CAPTURE)){
	$sanitisedHTMLstring = substr($unsanitesedHTMLstring, $bodyMatch[0][1]);
}else{
	$sanitisedHTMLstring = $unsanitesedHTMLstring;
}

?>

The script would need tidying, error checking, etc. For instance, if the page's body tag has an onload attribute (so it is not a plain <body>), you would need to alter the match slightly.

I figured it out.

 

<?php
// Directory that holds the files to clean; must end with a directory separator
// because file names are appended to it directly.
$base_dir = 'c:\\xampp\\htdocs\\work\\sanitize\\';

//CLEAN DATA HERE
// Map of search string => replacement, applied in this order by sanitizeFile().
// The ampersand and pound entries previously mapped each string to itself (a no-op)
// and the pound key was listed twice; they now produce the HTML entities.
// NOTE(review): the entity targets are inferred from the script's purpose - confirm.
$sanitize = array(
	' & ' => ' &amp; ',
	'£' => '&pound;',
	'<b>' => '<strong>',
	'</b>' => '</strong>',
	'<i>' => '<em>',
	'</i>' => '</em>'
);

/**
 * Collect the names of the regular files found directly in the base directory.
 *
 * Uses the global $base_dir. The "." and ".." entries, any sub-directory and
 * the script file sanitize.php are left out of the result.
 *
 * @return string File names joined with commas; '' when there is nothing to process.
 */
function outputDir(){
	global $base_dir;

	$dir = opendir($base_dir) or die('Couldn\'t open directory, please contact the web administrator');

	$filenames = array();
	while(($entry = readdir($dir)) !== false){
		$skipped = ($entry === '.' || $entry === '..' || $entry === 'sanitize.php');
		// is_dir() needs the full path; a bare name is resolved against the
		// current working directory rather than $base_dir.
		if(!$skipped && !is_dir($base_dir.$entry)){
			$filenames[] = $entry;
		}
	}
	closedir($dir);

	// Joining an empty list gives '', so there is no trailing comma to strip
	return implode(',', $filenames);
}
/**
 * Clean one HTML file and write it, under the same name, into the "sanitized"
 * sub-directory of the base directory.
 *
 * Per line: anchors are rewritten to carry only href plus a title equal to the
 * link text, then the global $sanitize replacement map is applied. Afterwards
 * every line before the first line containing an opening <body> tag is dropped
 * (including the very first line). The line holding the tag is kept whole, and
 * files without a body tag are written in full.
 *
 * Echoes a message when the source is empty, cannot be read, or the result
 * cannot be written.
 *
 * @param string $file File name relative to the global $base_dir.
 * @return void
 */
function sanitizeFile($file){
	global $base_dir, $sanitize;

	$lines = file($base_dir.$file);
	if($lines === false){
		// Nothing to transform; do not create a misleading empty output file
		echo 'File: '.$file.' could not be read!';
		return;
	}
	if(empty($lines)){
		echo 'File: '.$file.' empty!';
	}

	// str_replace() with arrays applies the pairs in map order, like a manual loop
	$search = array_keys($sanitize);
	$replace = array_values($sanitize);

	$new_lines = array();
	$body_tag_line = null;

	foreach($lines as $line_num => $line){
		// Record only the FIRST opening body tag. A plain substring test for "body"
		// would also hit </body> or ordinary text, and the last hit would win.
		if($body_tag_line === null && preg_match('/<body\b/i', $line)){
			$body_tag_line = $line_num;
		}
		/*CLEAN DATA HERE*/
		$linked = preg_replace('/<a[^>]*?href=[\'"](.*?)[\'"][^>]*?>(.*?)<\/a>/si','<a href="$1" title="$2">$2</a>',$line); //Add titles to anchor tags
		if($linked !== null){
			// preg_replace() returns null on a PCRE error; keep the untouched line then
			$line = $linked;
		}
		$new_lines[] = str_replace($search, $replace, $line);
	}

	if($body_tag_line !== null){
		// Drop everything above the body tag, starting from index 0
		$new_lines = array_slice($new_lines, $body_tag_line);
	}

	// DIRECTORY_SEPARATOR keeps the Windows path identical and also works elsewhere
	$target = $base_dir.'sanitized'.DIRECTORY_SEPARATOR.$file;
	if(file_put_contents($target, implode('', $new_lines)) === false){
		echo 'File: '.$file.' could not be written!';
	}
}
//Get filenames and sanitize
// Define $response up front: the template echoes it on every request,
// including the initial page load before the form is submitted.
$response = '';
if(isset($_POST['submit'])){
	$listing = (string) outputDir();
	// An empty listing must become an empty array; explode() alone would
	// return array('') and trigger a run for a nameless file.
	$filenames = ($listing === '') ? array() : explode(',', $listing);

	$response .= '<ul>'."\n";
	foreach($filenames as $file){
		sanitizeFile($file);
		// Escape the name before embedding it in the HTML result list
		$response .= '<li>Sanitized: '.htmlspecialchars($file).'</li>'."\n";
	}
	$response .= '</ul>'."\n";
}
?>

<?php /* Output section: base directory, submit form, then the per-file result list.
         Values echoed into markup are escaped; PHP_SELF can be influenced by the request URL. */ ?>
<p>Base Directory: <?php echo htmlspecialchars($base_dir); ?></p>

<form name="sanitize_data" action="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES); ?>" method="post">
<input type="submit" name="submit" value="Sanitize!" id="submit" />
</form>

<?php /* $response holds ready-made markup and exists only after a submit */ ?>
<?php echo isset($response) ? $response : ''; ?>

Archived

This topic is now archived and is closed to further replies.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.