randall Posted March 24, 2012 Share Posted March 24, 2012 I have been trying to hack two parts of code together... I had code writen that will grab the text from a website and completely clean it of all junk except for full words... then echo it. Now I am trying to use the same script to pull from a database instead of a URL but am lost... Here is my code... I would make another donation to the site if we can get this going... THANK YOU! <?php $con = mysql_connect("localhost","USERNAME","PASSWORD!"); mysql_select_db("DATABASE",$con); $get = "SELECT * FROM information_description WHERE information_id=4"; $SQ_query = mysql_query($get) or die("Query failed: $get\n" . mysql_error()); $fetch = mysql_fetch_array($SQ_query); $raw = $fetch['description']; /* Set internal character encoding to UTF-8 */ mb_internal_encoding("UTF-8"); mb_http_output( "UTF-8" ); ob_start("mb_output_handler"); function clean($html) { ###Remove number in html################ //$html = preg_replace("/[0-9]/", " ", $html); $html = preg_replace("/<([a-z][a-z0-9]*)[^>]*?(\/?)>/i",'<$1$2>', $html); // $html = preg_replace('/(<[^>]+) style=".*?"/i', '$1', $html); echo $html; $html = str_replace(" ", " ", $html); $html = str_replace("&", " ", $html); $html = str_replace("-", " ", $html); ######remove space $html = preg_replace ('/<[^>]*>/', '', $html); $html = preg_replace('/\s\s+/', ', ', $html); $html = preg_replace('/[\s\W]+/',' ',$html); // Strip off spaces and non-alpha-numeric return $html; } #call function //$raw = StripHtmlTags($raw); $raw = clean($raw); echo $raw; ##echo clean($html); $url = (isset($_GET['url']) ?$_GET['url'] : 0); $str = file_get_contents($url); ####################################################################3 function get_url_contents($url){ $crl = curl_init(); $timeout = 5; curl_setopt ($crl, CURLOPT_URL,$url); curl_setopt ($crl, CURLOPT_RETURNTRANSFER, 1); curl_setopt ($crl, CURLOPT_CONNECTTIMEOUT, $timeout); $ret = curl_exec($crl); curl_close($crl); return $ret; } #--------------------------------------Strip html tag---------------------------------------------------- function StripHtmlTags( $text ) { // PHP's strip_tags() function will remove tags, but it // doesn't remove scripts, styles, and other unwanted // invisible text between tags. Also, as a prelude to // tokenizing the text, we need to insure that when // block-level tags (such as <p> or <div>) are removed, // neighboring words aren't joined. $text = preg_replace( array( // Remove invisible content '@<head[^>]*?>.*?</head>@siu', '@<style[^>]*?>.*?</style>@siu', '@<script[^>]*?.*?</script>@siu', '@<object[^>]*?.*?</object>@siu', '@<embed[^>]*?.*?</embed>@siu', '@<applet[^>]*?.*?</applet>@siu', '@<noframes[^>]*?.*?</noframes>@siu', '@<noscript[^>]*?.*?</noscript>@siu', '@<noembed[^>]*?.*?</noembed>@siu', // Add line breaks before & after blocks '@<((br)|(hr))@iu', '@</?((address)|(blockquote)|(center)|(del))@iu', '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu', '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu', '@</?((table)|(th)|(td)|(caption))@iu', '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu', '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu', '@</?((frameset)|(frame)|(iframe))@iu', ), array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",),$text ); // Remove all remaining tags and comments and return. return strtolower( $text ); } function RemoveComments( & $string ) { $string = preg_replace("%(#|;|(//)).*%","",$string); $string = preg_replace("%/\*(??!\*/).)*\*/%s","",$string); // google for negative lookahead return $string; } $html = StripHtmlTags($str); ###Remove number in html################ $html = preg_replace("/[0-9]/", " ", $html); #replace by ' ' $html = str_replace(" ", " ", $html); ######remove any words################ $remove_word = file("swords.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); foreach($remove_word as $word) { $html = preg_replace("/\b". $word ."\b/", " ", $html); } ######remove space $html = preg_replace ('/<[^>]*>/', '', $html); $html = preg_replace('/\b\s+/', ', ', $html); $html = preg_replace('/[\b\W]+/',', ',$html); // Strip off spaces and non-alpha-numeric #remove white space, Keep : . ( ) : & //$html = preg_replace('/\s+/', ', ', $html); ###process######################################################################### $array_loop = explode(",", $html); $array_loop1 = $array_loop; $arr_tem = array(); foreach($array_loop as $key=>$val) { if(in_array($val, $array_loop1)) { if(!$arr_tem[$val]) $arr_tem[$val] = 0; $arr_tem[$val] += 1; if ( ($k = array_search($val, $array_loop1) ) !== false ) unset($array_loop1[$k]); } } arsort($arr_tem); ###echo top 20 words############################################################ echo "<h3>Top 20 words used most</h3>"; $i = 1; foreach($arr_tem as $key=>$val) { if($i<=20) { echo $i.": ".$key." (".$val." words)<br />"; $i++; }else break; } echo "<hr />"; ###print array##################################################################### echo (implode(", ", array_keys($arr_tem))); ?> Quote Link to comment https://forums.phpfreaks.com/topic/259629-in-depth-cleaning-database-field/ Share on other sites More sharing options...
AyKay47 Posted March 24, 2012 Share Posted March 24, 2012 It would help if you showed the actual results after the "clean" function is ran on $raw. Quote Link to comment https://forums.phpfreaks.com/topic/259629-in-depth-cleaning-database-field/#findComment-1330745 Share on other sites More sharing options...
randall Posted March 24, 2012 Author Share Posted March 24, 2012 Lets use this code example instead... not much different but it pulls info... Here is the output http://salesleadhq.com/mien/new.php <?php $con = mysql_connect("localhost","USERNAME","PASSWORD!"); mysql_select_db("DATABASE",$con); $get = "SELECT * FROM information_description WHERE information_id=4"; $SQ_query = mysql_query($get) or die("Query failed: $get\n" . mysql_error()); $fetch = mysql_fetch_array($SQ_query); $raw = $fetch['description']; /* Set internal character encoding to UTF-8 */ mb_internal_encoding("UTF-8"); mb_http_output( "UTF-8" ); ob_start("mb_output_handler"); function clean($html) { ###Remove number in html################ //$html = preg_replace("/[0-9]/", " ", $html); $html = preg_replace("/<([a-z][a-z0-9]*)[^>]*?(\/?)>/i",'<$1$2>', $html); // $html = preg_replace('/(<[^>]+) style=".*?"/i', '$1', $html); echo $html; $html = str_replace(" ", " ", $html); $html = str_replace("&", " ", $html); $html = str_replace("-", " ", $html); ######remove space $html = preg_replace ('/<[^>]*>/', '', $html); $html = preg_replace('/\s\s+/', ', ', $html); $html = preg_replace('/[\s\W]+/',' ',$html); // Strip off spaces and non-alpha-numeric return $html; } #call function //$raw = StripHtmlTags($raw); $raw = clean($raw); echo $raw; ##echo clean($html); $url = (isset($_GET['url']) ?$_GET['url'] : 0); $str = file_get_contents($url); ####################################################################3 function get_url_contents($url){ $crl = curl_init(); $timeout = 5; curl_setopt ($crl, CURLOPT_URL,$url); curl_setopt ($crl, CURLOPT_RETURNTRANSFER, 1); curl_setopt ($crl, CURLOPT_CONNECTTIMEOUT, $timeout); $ret = curl_exec($crl); curl_close($crl); return $ret; } #--------------------------------------Strip html tag---------------------------------------------------- function StripHtmlTags( $text ) { // PHP's strip_tags() function will remove tags, but it // doesn't remove scripts, styles, and other unwanted // invisible text between tags. Also, as a prelude to // tokenizing the text, we need to insure that when // block-level tags (such as <p> or <div>) are removed, // neighboring words aren't joined. $text = preg_replace( array( // Remove invisible content '@<head[^>]*?>.*?</head>@siu', '@<style[^>]*?>.*?</style>@siu', '@<script[^>]*?.*?</script>@siu', '@<object[^>]*?.*?</object>@siu', '@<embed[^>]*?.*?</embed>@siu', '@<applet[^>]*?.*?</applet>@siu', '@<noframes[^>]*?.*?</noframes>@siu', '@<noscript[^>]*?.*?</noscript>@siu', '@<noembed[^>]*?.*?</noembed>@siu', // Add line breaks before & after blocks '@<((br)|(hr))@iu', '@</?((address)|(blockquote)|(center)|(del))@iu', '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu', '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu', '@</?((table)|(th)|(td)|(caption))@iu', '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu', '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu', '@</?((frameset)|(frame)|(iframe))@iu', ), array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",),$text ); // Remove all remaining tags and comments and return. return strtolower( $text ); } function RemoveComments( & $string ) { $string = preg_replace("%(#|;|(//)).*%","",$string); $string = preg_replace("%/\*(??!\*/).)*\*/%s","",$string); // google for negative lookahead return $string; } $html = StripHtmlTags($str); ###Remove number in html################ $html = preg_replace("/[0-9]/", " ", $html); #replace by ' ' $html = str_replace(" ", " ", $html); ######remove any words################ $remove_word = file("swords.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); foreach($remove_word as $word) { $html = preg_replace("/\b". $word ."\b/", " ", $html); } ######remove space $html = preg_replace ('/<[^>]*>/', '', $html); $html = preg_replace('/\b\s+/', ', ', $html); $html = preg_replace('/[\b\W]+/',', ',$html); // Strip off spaces and non-alpha-numeric #remove white space, Keep : . ( ) : & //$html = preg_replace('/\s+/', ', ', $html); ###process######################################################################### $array_loop = explode(",", $html); $array_loop1 = $array_loop; $arr_tem = array(); foreach($array_loop as $key=>$val) { if(in_array($val, $array_loop1)) { if(!$arr_tem[$val]) $arr_tem[$val] = 0; $arr_tem[$val] += 1; if ( ($k = array_search($val, $array_loop1) ) !== false ) unset($array_loop1[$k]); } } arsort($arr_tem); ###echo top 20 words############################################################ echo "<h3>Top 20 words used most</h3>"; $i = 1; foreach($arr_tem as $key=>$val) { if($i<=20) { echo $i.": ".$key." (".$val." words)<br />"; $i++; }else break; } echo "<hr />"; ###print array##################################################################### echo (implode(", ", array_keys($arr_tem))); ?> Quote Link to comment https://forums.phpfreaks.com/topic/259629-in-depth-cleaning-database-field/#findComment-1330754 Share on other sites More sharing options...
AyKay47 Posted March 24, 2012 Share Posted March 24, 2012 I think you are over complicating this. First, to remove any tags found in the string, you can simply use strip_tags which will leav you with the content inside of the tags. Then you can do any other sanitization you want. Quote Link to comment https://forums.phpfreaks.com/topic/259629-in-depth-cleaning-database-field/#findComment-1330760 Share on other sites More sharing options...
randall Posted March 24, 2012 Author Share Posted March 24, 2012 Anyone want $20.00? I just want it to work so I can move on... This works without database... it pulls from the url http://salesleadhq.com/tools/crawler/meta.php?url=http://www.boormanarchery.com The complete code for that file is below this message. It works perfectly... I thought I was over complicating it as well and spent quite a while figuring this out... I tried "strip_tags()" and it left me with a bunch of non human readable text. All I want left over are full english words, (not leftovers) and I want to echo those full words... I want the words that apear more often to be listed in order. I dont know what I am doing at this point... I have been trying to get it done for a few weeks. I have a text file that contains words that I want to omitt as well. Please feel sorry for me. thanks in advance! <?php $url = (isset($_GET['url']) ?$_GET['url'] : 0); $str = file_get_contents($url); ####################################################################3 function get_url_contents($url){ $crl = curl_init(); $timeout = 5; curl_setopt ($crl, CURLOPT_URL,$url); curl_setopt ($crl, CURLOPT_RETURNTRANSFER, 1); curl_setopt ($crl, CURLOPT_CONNECTTIMEOUT, $timeout); $ret = curl_exec($crl); curl_close($crl); return $ret; } #--------------------------------------Strip html tag---------------------------------------------------- function StripHtmlTags( $text ) { // PHP's strip_tags() function will remove tags, but it // doesn't remove scripts, styles, and other unwanted // invisible text between tags. Also, as a prelude to // tokenizing the text, we need to insure that when // block-level tags (such as <p> or <div>) are removed, // neighboring words aren't joined. $text = preg_replace( array( // Remove invisible content '@<head[^>]*?>.*?</head>@siu', '@<style[^>]*?>.*?</style>@siu', '@<script[^>]*?.*?</script>@siu', '@<object[^>]*?.*?</object>@siu', '@<embed[^>]*?.*?</embed>@siu', '@<applet[^>]*?.*?</applet>@siu', '@<noframes[^>]*?.*?</noframes>@siu', '@<noscript[^>]*?.*?</noscript>@siu', '@<noembed[^>]*?.*?</noembed>@siu', // Add line breaks before & after blocks '@<((br)|(hr))@iu', '@</?((address)|(blockquote)|(center)|(del))@iu', '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu', '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu', '@</?((table)|(th)|(td)|(caption))@iu', '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu', '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu', '@</?((frameset)|(frame)|(iframe))@iu', ), array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",),$text ); // Remove all remaining tags and comments and return. return strtolower( $text ); } function RemoveComments( & $string ) { $string = preg_replace("%(#|;|(//)).*%","",$string); $string = preg_replace("%/\*(??!\*/).)*\*/%s","",$string); // google for negative lookahead return $string; } $html = StripHtmlTags($str); ###Remove number in html################ $html = preg_replace("/[0-9]/", " ", $html); #replace by ' ' $html = str_replace(" ", " ", $html); ######remove any words################ $remove_word = file("swords.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); foreach($remove_word as $word) { $html = preg_replace("/\b". $word ."\b/", " ", $html); } ######remove space $html = preg_replace ('/<[^>]*>/', '', $html); $html = preg_replace('/\b\s+/', ', ', $html); $html = preg_replace('/[\b\W]+/',', ',$html); // Strip off spaces and non-alpha-numeric #remove white space, Keep : . ( ) : & //$html = preg_replace('/\s+/', ', ', $html); ###process######################################################################### $array_loop = explode(",", $html); $array_loop1 = $array_loop; $arr_tem = array(); foreach($array_loop as $key=>$val) { if(in_array($val, $array_loop1)) { if(!$arr_tem[$val]) $arr_tem[$val] = 0; $arr_tem[$val] += 1; if ( ($k = array_search($val, $array_loop1) ) !== false ) unset($array_loop1[$k]); } } arsort($arr_tem); ###echo top 20 words############################################################ echo "<h3>Top 20 words used most</h3>"; $i = 1; foreach($arr_tem as $key=>$val) { if($i<=20) { echo $i.": ".$key." (".$val." words)<br />"; $i++; }else break; } echo "<hr />"; ###print array##################################################################### echo (implode(", ", array_keys($arr_tem))); ?> Quote Link to comment https://forums.phpfreaks.com/topic/259629-in-depth-cleaning-database-field/#findComment-1330815 Share on other sites More sharing options...
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.