Jump to content

In depth cleaning - database field


randall

Recommended Posts

 

I have been trying to hack two parts of code together... I had code writen that will grab the text from a website and completely clean it of all junk except for full words... then echo it.  Now I am trying to use the same script to pull from a database instead of a URL but am lost... Here is my code... I would make another donation to the site if we can get this going... THANK YOU!

 


<?php

$con = mysql_connect("localhost","USERNAME","PASSWORD!");
mysql_select_db("DATABASE",$con);

$get = "SELECT * FROM information_description WHERE information_id=4";
$SQ_query = mysql_query($get) or die("Query failed: $get\n" . mysql_error());
$fetch = mysql_fetch_array($SQ_query);

$raw = $fetch['description'];


/* Set internal character encoding to UTF-8 */
mb_internal_encoding("UTF-8");
mb_http_output( "UTF-8" );
ob_start("mb_output_handler");

function clean($html) {
###Remove number in html################
//$html  = preg_replace("/[0-9]/", " ", $html);


$html =  preg_replace("/<([a-z][a-z0-9]*)[^>]*?(\/?)>/i",'<$1$2>', $html);

//	$html = preg_replace('/(<[^>]+) style=".*?"/i', '$1', $html);
echo $html;

$html = str_replace(" ", " ", $html);
$html = str_replace("&", " ", $html);
$html = str_replace("-", " ", $html);
######remove space
$html =  preg_replace ('/<[^>]*>/', '', $html);

$html =  preg_replace('/\s\s+/', ', ', $html);
$html =  preg_replace('/[\s\W]+/',' ',$html);   // Strip off spaces and non-alpha-numeric 
return $html;
}

#call function
//$raw = StripHtmlTags($raw);
$raw = clean($raw);
echo $raw;

##echo clean($html);

$url = (isset($_GET['url']) ?$_GET['url'] : 0);
$str = file_get_contents($url);
####################################################################3
function get_url_contents($url){
        $crl = curl_init();
        $timeout = 5;
        curl_setopt ($crl, CURLOPT_URL,$url);
        curl_setopt ($crl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt ($crl, CURLOPT_CONNECTTIMEOUT, $timeout);
        $ret = curl_exec($crl);
        curl_close($crl);
        return $ret;
}
#--------------------------------------Strip html tag----------------------------------------------------
function StripHtmlTags( $text )
{
  // PHP's strip_tags() function will remove tags, but it
  // doesn't remove scripts, styles, and other unwanted
  // invisible text between tags.  Also, as a prelude to
  // tokenizing the text, we need to insure that when
  // block-level tags (such as <p> or <div>) are removed,
  // neighboring words aren't joined.
  $text = preg_replace(
    array(
      // Remove invisible content
      '@<head[^>]*?>.*?</head>@siu',
      '@<style[^>]*?>.*?</style>@siu',
      '@<script[^>]*?.*?</script>@siu',
      '@<object[^>]*?.*?</object>@siu',
      '@<embed[^>]*?.*?</embed>@siu',
      '@<applet[^>]*?.*?</applet>@siu',
      '@<noframes[^>]*?.*?</noframes>@siu',
      '@<noscript[^>]*?.*?</noscript>@siu',
      '@<noembed[^>]*?.*?</noembed>@siu',

      // Add line breaks before & after blocks
      '@<((br)|(hr))@iu',
      '@</?((address)|(blockquote)|(center)|(del))@iu',
      '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
      '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
      '@</?((table)|(th)|(td)|(caption))@iu',
      '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
      '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
      '@</?((frameset)|(frame)|(iframe))@iu',
    ),
    array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",),$text );

  // Remove all remaining tags and comments and return.
  return strtolower( $text );
}

function RemoveComments( & $string )
{
  $string = preg_replace("%(#|;|(//)).*%","",$string);
  $string = preg_replace("%/\*(??!\*/).)*\*/%s","",$string); // google for negative lookahead
  return $string;
}


$html = StripHtmlTags($str);

###Remove number in html################
$html  = preg_replace("/[0-9]/", " ", $html);

#replace   by ' '
$html = str_replace(" ", " ", $html);

######remove any words################

$remove_word = file("swords.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
foreach($remove_word as $word) {
$html = preg_replace("/\b". $word ."\b/", " ", $html);
}
######remove space
$html =  preg_replace ('/<[^>]*>/', '', $html);

$html =  preg_replace('/\b\s+/', ', ', $html);
$html =  preg_replace('/[\b\W]+/',', ',$html);   // Strip off spaces and non-alpha-numeric 

#remove white space, Keep : . ( ) : &
//$html = preg_replace('/\s+/', ', ', $html);


###process#########################################################################
$array_loop = explode(",", $html);
$array_loop1 = $array_loop;
$arr_tem = array();

foreach($array_loop as $key=>$val) {
if(in_array($val, $array_loop1)) {
	if(!$arr_tem[$val]) $arr_tem[$val] = 0;
	$arr_tem[$val] += 1;

	if ( ($k = array_search($val, $array_loop1) ) !== false )
	unset($array_loop1[$k]);
}
}

arsort($arr_tem);

###echo top 20 words############################################################
echo "<h3>Top 20 words used most</h3>";
$i = 1;
foreach($arr_tem as $key=>$val) {
if($i<=20) {
	echo $i.":  ".$key." (".$val." words)<br />";
	$i++;
}else break;
}
echo "<hr />";
###print array#####################################################################
echo (implode(", ", array_keys($arr_tem)));

?>

Link to comment
https://forums.phpfreaks.com/topic/259629-in-depth-cleaning-database-field/
Share on other sites

 

 

 

 

Lets use this code example instead... not much different but it pulls info...

 

Here is the output

http://salesleadhq.com/mien/new.php

 


<?php

$con = mysql_connect("localhost","USERNAME","PASSWORD!");
mysql_select_db("DATABASE",$con);

$get = "SELECT * FROM information_description WHERE information_id=4";
$SQ_query = mysql_query($get) or die("Query failed: $get\n" . mysql_error());
$fetch = mysql_fetch_array($SQ_query);

$raw = $fetch['description'];


/* Set internal character encoding to UTF-8 */
mb_internal_encoding("UTF-8");
mb_http_output( "UTF-8" );
ob_start("mb_output_handler");

function clean($html) {
###Remove number in html################
//$html  = preg_replace("/[0-9]/", " ", $html);


$html =  preg_replace("/<([a-z][a-z0-9]*)[^>]*?(\/?)>/i",'<$1$2>', $html);

//	$html = preg_replace('/(<[^>]+) style=".*?"/i', '$1', $html);
echo $html;

$html = str_replace(" ", " ", $html);
$html = str_replace("&", " ", $html);
$html = str_replace("-", " ", $html);
######remove space
$html =  preg_replace ('/<[^>]*>/', '', $html);

$html =  preg_replace('/\s\s+/', ', ', $html);
$html =  preg_replace('/[\s\W]+/',' ',$html);   // Strip off spaces and non-alpha-numeric 
return $html;
}

#call function
//$raw = StripHtmlTags($raw);
$raw = clean($raw);
echo $raw;

##echo clean($html);

$url = (isset($_GET['url']) ?$_GET['url'] : 0);
$str = file_get_contents($url);
####################################################################3
function get_url_contents($url){
        $crl = curl_init();
        $timeout = 5;
        curl_setopt ($crl, CURLOPT_URL,$url);
        curl_setopt ($crl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt ($crl, CURLOPT_CONNECTTIMEOUT, $timeout);
        $ret = curl_exec($crl);
        curl_close($crl);
        return $ret;
}
#--------------------------------------Strip html tag----------------------------------------------------
function StripHtmlTags( $text )
{
  // PHP's strip_tags() function will remove tags, but it
  // doesn't remove scripts, styles, and other unwanted
  // invisible text between tags.  Also, as a prelude to
  // tokenizing the text, we need to insure that when
  // block-level tags (such as <p> or <div>) are removed,
  // neighboring words aren't joined.
  $text = preg_replace(
    array(
      // Remove invisible content
      '@<head[^>]*?>.*?</head>@siu',
      '@<style[^>]*?>.*?</style>@siu',
      '@<script[^>]*?.*?</script>@siu',
      '@<object[^>]*?.*?</object>@siu',
      '@<embed[^>]*?.*?</embed>@siu',
      '@<applet[^>]*?.*?</applet>@siu',
      '@<noframes[^>]*?.*?</noframes>@siu',
      '@<noscript[^>]*?.*?</noscript>@siu',
      '@<noembed[^>]*?.*?</noembed>@siu',

      // Add line breaks before & after blocks
      '@<((br)|(hr))@iu',
      '@</?((address)|(blockquote)|(center)|(del))@iu',
      '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
      '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
      '@</?((table)|(th)|(td)|(caption))@iu',
      '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
      '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
      '@</?((frameset)|(frame)|(iframe))@iu',
    ),
    array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",),$text );

  // Remove all remaining tags and comments and return.
  return strtolower( $text );
}

function RemoveComments( & $string )
{
  $string = preg_replace("%(#|;|(//)).*%","",$string);
  $string = preg_replace("%/\*(??!\*/).)*\*/%s","",$string); // google for negative lookahead
  return $string;
}


$html = StripHtmlTags($str);

###Remove number in html################
$html  = preg_replace("/[0-9]/", " ", $html);

#replace   by ' '
$html = str_replace(" ", " ", $html);

######remove any words################

$remove_word = file("swords.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
foreach($remove_word as $word) {
$html = preg_replace("/\b". $word ."\b/", " ", $html);
}
######remove space
$html =  preg_replace ('/<[^>]*>/', '', $html);

$html =  preg_replace('/\b\s+/', ', ', $html);
$html =  preg_replace('/[\b\W]+/',', ',$html);   // Strip off spaces and non-alpha-numeric 

#remove white space, Keep : . ( ) : &
//$html = preg_replace('/\s+/', ', ', $html);


###process#########################################################################
$array_loop = explode(",", $html);
$array_loop1 = $array_loop;
$arr_tem = array();

foreach($array_loop as $key=>$val) {
if(in_array($val, $array_loop1)) {
	if(!$arr_tem[$val]) $arr_tem[$val] = 0;
	$arr_tem[$val] += 1;

	if ( ($k = array_search($val, $array_loop1) ) !== false )
	unset($array_loop1[$k]);
}
}

arsort($arr_tem);

###echo top 20 words############################################################
echo "<h3>Top 20 words used most</h3>";
$i = 1;
foreach($arr_tem as $key=>$val) {
if($i<=20) {
	echo $i.":  ".$key." (".$val." words)<br />";
	$i++;
}else break;
}
echo "<hr />";
###print array#####################################################################
echo (implode(", ", array_keys($arr_tem)));



?>


 

 

 

Anyone want $20.00? I just want it to work so I can move on...

 

This works without database... it pulls from the url

http://salesleadhq.com/tools/crawler/meta.php?url=http://www.boormanarchery.com

 

The complete code for that file is below this message. It works perfectly...

 

I thought I was over complicating it as well and spent quite a while figuring this out... I tried "strip_tags()" and it left me with a bunch of non human readable text. All I want left over are full english words, (not leftovers) and I want to echo those full words... I want the words that apear more often to be listed in order. I dont know what I am doing at this point... I have been trying to get it done for a few weeks.

 

I have a text file that contains words that I want to omitt as well.

Please feel sorry for me.  :)

 

thanks in advance!

 

<?php




$url = (isset($_GET['url']) ?$_GET['url'] : 0);

$str = file_get_contents($url);

####################################################################3

function get_url_contents($url){

        $crl = curl_init();

        $timeout = 5;

        curl_setopt ($crl, CURLOPT_URL,$url);

        curl_setopt ($crl, CURLOPT_RETURNTRANSFER, 1);

        curl_setopt ($crl, CURLOPT_CONNECTTIMEOUT, $timeout);

        $ret = curl_exec($crl);

        curl_close($crl);

        return $ret;

}

#--------------------------------------Strip html tag----------------------------------------------------

function StripHtmlTags( $text )

{

  // PHP's strip_tags() function will remove tags, but it

  // doesn't remove scripts, styles, and other unwanted

  // invisible text between tags.  Also, as a prelude to

  // tokenizing the text, we need to insure that when

  // block-level tags (such as <p> or <div>) are removed,

  // neighboring words aren't joined.

  $text = preg_replace(

    array(

      // Remove invisible content

      '@<head[^>]*?>.*?</head>@siu',

      '@<style[^>]*?>.*?</style>@siu',

      '@<script[^>]*?.*?</script>@siu',

      '@<object[^>]*?.*?</object>@siu',

      '@<embed[^>]*?.*?</embed>@siu',

      '@<applet[^>]*?.*?</applet>@siu',

      '@<noframes[^>]*?.*?</noframes>@siu',

      '@<noscript[^>]*?.*?</noscript>@siu',

      '@<noembed[^>]*?.*?</noembed>@siu',



      // Add line breaks before & after blocks

      '@<((br)|(hr))@iu',

      '@</?((address)|(blockquote)|(center)|(del))@iu',

      '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',

      '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',

      '@</?((table)|(th)|(td)|(caption))@iu',

      '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',

      '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',

      '@</?((frameset)|(frame)|(iframe))@iu',

    ),

    array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",),$text );



  // Remove all remaining tags and comments and return.

  return strtolower( $text );

}



function RemoveComments( & $string )

{

  $string = preg_replace("%(#|;|(//)).*%","",$string);

  $string = preg_replace("%/\*(??!\*/).)*\*/%s","",$string); // google for negative lookahead

  return $string;

}





$html = StripHtmlTags($str);



###Remove number in html################

$html  = preg_replace("/[0-9]/", " ", $html);



#replace   by ' '

$html = str_replace(" ", " ", $html);



######remove any words################



$remove_word = file("swords.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);

foreach($remove_word as $word) {

$html = preg_replace("/\b". $word ."\b/", " ", $html);

}

######remove space

$html =  preg_replace ('/<[^>]*>/', '', $html);



$html =  preg_replace('/\b\s+/', ', ', $html);

$html =  preg_replace('/[\b\W]+/',', ',$html);   // Strip off spaces and non-alpha-numeric 



#remove white space, Keep : . ( ) : &

//$html = preg_replace('/\s+/', ', ', $html);





###process#########################################################################

$array_loop = explode(",", $html);

$array_loop1 = $array_loop;

$arr_tem = array();



foreach($array_loop as $key=>$val) {

if(in_array($val, $array_loop1)) {

	if(!$arr_tem[$val]) $arr_tem[$val] = 0;

	$arr_tem[$val] += 1;



	if ( ($k = array_search($val, $array_loop1) ) !== false )

	unset($array_loop1[$k]);

}

}



arsort($arr_tem);



###echo top 20 words############################################################

echo "<h3>Top 20 words used most</h3>";

$i = 1;

foreach($arr_tem as $key=>$val) {

if($i<=20) {

	echo $i.":  ".$key." (".$val." words)<br />";

	$i++;

}else break;

}

echo "<hr />";

###print array#####################################################################

echo (implode(", ", array_keys($arr_tem)));



?>

 

 

Archived

This topic is now archived and is closed to further replies.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.