atomMan Posted January 11, 2008 Share Posted January 11, 2008 If anyone has a better way of generating an rss/xml doc from an html page, i'm all ears. This script takes an html document and generates an rss feed based on what tags you tell it to look for (i have it set to start at <h1> and end at </h1>). Sometimes it works, sometimes it doesn't. Even when it does though, it's not doing what i need to do. First problem is that i can't get it to parse more than ~20 rss items. This line affect that, but it doesn't work properly. If i increase the value too much, the script seems to fail all the time (no errors). $match_count = ($match_count > 25) ? 25 : $match_count; Also, i want it to start the item at <h1> and end at the first </p>. If </p> is not present before the next <h1>, then i want it to end at </h1>. So what's between <h1> and </h1> should be the channel and what's between <p> and </p> should be the description ( oh, and i need to be able to limit the word count in the description too). This script is not mine. I'm just trying to hack it into shape... and failing miserably :-\ <?php $pageurl = $_GET['pageurl']; if ($pageurl) { parse_html($pageurl); } else { show_form(); } function show_form() { $server = getenv("SERVER_NAME"); $request = getenv("REQUEST_URI"); ?> <html> <head> <meta http-equiv="Content-Type" content="text/html; charset=windows-1252"> <title>RSSgenr8</title> </head> <body> <form action="<? print 'http://' . $server . $request; ?>"> <input type="text" name="pageurl" size=50> <input type="submit" value="Create RSS"> Include a final "/" or a filename. </form> <ul> <li>The channel title is taken from the web page title. <li>The channel description is taken from the meta description. <li>The item text is put in the description element. <li>The first line or the first 100 characters of html stripped description are put in the title element. <li>The first link in the description is put in the link element. If there isn't one, the web page url is used. <li>Relative paths in the link url are converted to absolute paths. <li>All tags except <A> <B> <BR> <BLOCKQUOTE> <CENTER> <DD> <DL> <DT> <HR> <I> <IMG> <LI> <OL> <P> <PRE> <U> <UL> are stripped from the description. <li>Tabs, NewLines, etc, in the description are converted to a single space <li>A maximum of 25 items are included in the rss. </ul> </body> </html> <? } function parse_html($pageurl){ $itemregexp = "%<h1>(.+?)</p>%is"; $allowable_tags = "<A><B><br /><br><BLOCKQUOTE><CENTER><DD><DL><DT><HR><I><IMG><LI> <OL><P><PRE><U><UL>"; $pageurlparts = parse_url($pageurl); if ($pageurlparts[path] == "") $pageurl .= "/"; if ($fp = @fopen($pageurl, "r")) { while (!feof($fp)) { $data .= fgets($fp, 128); } fclose($fp); } // print "<pre>"; // print htmlentities($data); // eregi("<title>(.*)</title>", $data, $title); // $channel_title = $title[1]; $channel_title = ""; if (preg_match('/<title>(.+?)<\/title>/i', $data, $regs) > 0) { $channel_title = $regs[1]; } if (preg_match('/<meta .*description.*"(.+?)"/i', $data, $regs) > 0) { $channel_desc = $regs[1]; } if ($channel_desc == "") $channel_desc = $pageurl; $match_count = preg_match_all($itemregexp, $data, $items); $match_count = ($match_count > 25) ? 25 : $match_count; header("Content-Type: text/xml"); $output .= "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" ?>\n"; $output .= "<!-- generator=\"rssgenr8/0.92\" -->\n"; $output .= "<!DOCTYPE rss PUBLIC \"-//W3C//ENTITIES Latin 1 for XHTML//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent\">\n"; $output .= "<rss version=\"0.92\">\n"; $output .= " <channel>\n"; $output .= " <title>". htmlentities(strip_tags($channel_title)) ."</title>\n"; $output .= " <link>". htmlentities($pageurl) ."</link>\n"; $output .= " <description>". htmlentities($channel_desc) ."</description>\n"; $output .= " <webMaster>". htmlentities("webmaster") ."</webMaster>\n"; $output .= " <generator>". htmlentities("RSSgenr8 from XMLhub.com") ."</generator>\n"; $output .= " <language>en</language>\n"; for ($i=0; $i< $match_count; $i++) { $desc = $items[1][$i]; $title = wsstrip($desc); $descout = $desc; if (preg_match("/(.+?)(?:<\/P|<\/div|<br|<\/h|<\/td)/i", $title, $regs) > 0) { $title = $regs[1]; if (strlen(wsstrip(trim(strip_tags($title)))) < 100) { $descout = str_replace($title,"",$descout); } } $title = wsstrip(trim(strip_tags($title))); if (strlen($title) > 100) { $title = substr($title,0,100) . " ..."; } $item_url = get_link($desc, $pageurl); $descout = wsstrip(strip_tags($descout, $allowable_tags)); $pos = strpos($descout, "<br>"); if (is_int($pos) and ($pos == 0)) { $descout=substr($descout, 4); } $pos = strpos($descout, "<br />"); if (is_int($pos) and ($pos == 0)) { $descout=substr($descout, 6); } $descout = htmlentities(wsstrip($descout)); $output .= " <item>\n"; $output .= " <title>". htmlentities($title) ."</title>\n"; $output .= " <link>". htmlentities($item_url) ."</link>\n"; $output .= " <description>". $descout ."</description>\n"; $output .= " </item>\n"; } $output .= " </channel>\n"; $output .= "</rss>\n"; print $output; // print htmlentities($output); // print "</pre>"; } function get_link($desc, $pageurl) { if (stristr($desc, "href")) { $linkurl = stristr($desc, "href"); $linkurl = substr($linkurl, strpos($linkurl, "\"")+1); $linkurl = substr($linkurl, 0, strpos($linkurl, "\"")); $linkurl = trim($linkurl); $pageurlarray = parse_url($linkurl); if (empty($pageurlarray['host'])) { $linkurl = make_abs($linkurl, $pageurl); } return $linkurl; } else { return $pageurl; } } function wsstrip($str) { $str=ereg_replace("[\r\t\n]"," ",$str); $str=ereg_replace (' +', ' ', trim($str)); return $str; } function make_abs($rel_uri, $base, $REMOVE_LEADING_DOTS = true) { preg_match("'^([^:]+://[^/]+)/'", $base, $m); $base_start = $m[1]; if (preg_match("'^/'", $rel_uri)) { return $base_start . $rel_uri; } $base = preg_replace("{[^/]+$}", '', $base); $base .= $rel_uri; $base = preg_replace("{^[^:]+://[^/]+}", '', $base); $base_array = explode('/', $base); if (count($base_array) and!strlen($base_array[0])) array_shift($base_array); $i = 1; while ($i < count($base_array)) { if ($base_array[$i - 1] == ".") { array_splice($base_array, $i - 1, 1); if ($i > 1) $i--; } elseif ($base_array[$i] == ".." and $base_array[$i - 1]!= "..") { array_splice($base_array, $i - 1, 2); if ($i > 1) { $i--; if ($i == count($base_array)) array_push($base_array, ""); } } else { $i++; } } if (count($base_array) and $base_array[-1] == ".") $base_array[-1] = ""; if ($REMOVE_LEADING_DOTS) { while (count($base_array) and preg_match("/^\.\.?$/", $base_array[0])) { array_shift($base_array); } } return($base_start . '/' . implode("/", $base_array)); } ?> Quote Link to comment https://forums.phpfreaks.com/topic/85466-noob-needs-help-with-xmlrss-generation-script/ Share on other sites More sharing options...
10,000 BC Man Posted April 1, 2008 Share Posted April 1, 2008 My guess is you will probably find something in the PEAR packages to do this. BC_Man Quote Link to comment https://forums.phpfreaks.com/topic/85466-noob-needs-help-with-xmlrss-generation-script/#findComment-506587 Share on other sites More sharing options...
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.