Jump to content

Recommended Posts

If anyone has a better way of generating an RSS/XML document from an HTML page, I'm all ears.

 

This script takes an HTML document and generates an RSS feed based on the tags you tell it to look for (I have it set to start at <h1> and end at </h1>).

 

Sometimes it works, sometimes it doesn't. Even when it does, though, it's not doing what I need it to do.

 

The first problem is that I can't get it to parse more than ~20 RSS items. The line below affects that, but it doesn't work properly. If I increase the value too much, the script seems to fail all the time (no errors).

 

$match_count = ($match_count > 25) ? 25 : $match_count;

 

Also, I want it to start the item at <h1> and end at the first </p>. If </p> is not present before the next <h1>, then I want it to end at </h1>. So what's between <h1> and </h1> should be the channel, and what's between <p> and </p> should be the description (oh, and I need to be able to limit the word count in the description too).

 

This script is not mine. I'm just trying to hack it into shape... and failing miserably :-\

 

<?php
// Entry point: when a page URL was submitted, emit its RSS feed; otherwise
// show the input form. The parameter is absent on the first visit, so it is
// read defensively (no undefined-index notice, no array values).
$pageurl = '';
if (isset($_GET['pageurl']) && is_string($_GET['pageurl'])) {
  $pageurl = trim($_GET['pageurl']);
}

if ($pageurl !== '') {
  parse_html($pageurl);
} else {
  show_form();
}

/**
 * Prints the HTML page containing the "pageurl" input form and a summary of
 * how the feed is generated.
 *
 * The form posts back to the current request URL, built from the SERVER_NAME
 * and REQUEST_URI environment variables.
 */
function show_form() {
  $server = getenv("SERVER_NAME");
  $request = getenv("REQUEST_URI");
  // Both values are influenced by the client, so the URL is escaped before it
  // is written into the action attribute (prevents attribute break-out / XSS).
  $action = htmlspecialchars('http://' . $server . $request, ENT_QUOTES, 'ISO-8859-1');
?>
<html>

<head>
<meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
<title>RSSgenr8</title>
</head>
<body>
<form action="<?php print $action; ?>">
  <input type="text" name="pageurl" size=50> 
  <input type="submit" value="Create RSS">
  Include a final "/" or a filename.
</form>
<ul>
   <li>The channel title is taken from the web page title.
   <li>The channel description is taken from the meta description.
   <li>The item text is put in the description element.
   <li>The first line or the first 100 characters of html stripped description are put in the title element.
   <li>The first link in the description is put in the link element. If there isn't one, the web page url is used.
   <li>Relative paths in the link url are converted to absolute paths.
   <li>All tags except &lt;A&gt; &lt;B&gt; &lt;BR&gt; &lt;BLOCKQUOTE&gt; &lt;CENTER&gt; &lt;DD&gt; &lt;DL&gt; &lt;DT&gt; &lt;HR&gt; &lt;I&gt; &lt;IMG&gt; &lt;LI&gt; &lt;OL&gt; &lt;P&gt; &lt;PRE&gt; &lt;U&gt; &lt;UL&gt; are stripped from the description.
   <li>Tabs, NewLines, etc, in the description are converted to a single space
   <li>A maximum of 25 items are included in the rss.
</ul>
</body>
</html>   
<?php
}

/**
 * Fetches the HTML page at $pageurl and prints it as an RSS 0.92 feed.
 *
 * The channel title is taken from the page <title>; the channel description
 * from the meta description, falling back to the page URL. Every span that
 * starts at an <h1> and ends at the first following </p> becomes one item:
 * its leading line (up to the first closing P/DIV/H/TD tag or <br>) is the
 * item title, the first href in it is the item link, and the remaining text
 * (with only the allowed tags kept) is the item description.
 *
 * On an unusable URL or a failed download a plain-text error is sent with a
 * 400/502 status instead of an empty feed.
 *
 * @param string $pageurl   Absolute http:// or https:// URL of the page.
 * @param int    $max_items Maximum number of items to emit (default 25).
 */
function parse_html($pageurl, $max_items = 25) {
  // Note: because the item ends at the first </p>, a heading that is not
  // followed by a paragraph absorbs everything up to the next </p>.
  $itemregexp = "%<h1>(.+?)</p>%is";
  $allowable_tags = "<A><B><br /><br><BLOCKQUOTE><CENTER><DD><DL><DT><HR><I><IMG><LI> <OL><P><PRE><U><UL>";

  // The URL comes straight from the query string. Only remote http(s) pages
  // are accepted so that local files and php:// style wrappers cannot be read.
  // NOTE(review): any http(s) host reachable from this server can still be
  // requested; restrict the allowed hosts if this script is publicly exposed.
  $pageurlparts = parse_url($pageurl);
  $scheme = (is_array($pageurlparts) && isset($pageurlparts['scheme']))
    ? strtolower($pageurlparts['scheme'])
    : '';
  if (($scheme != 'http' && $scheme != 'https') || empty($pageurlparts['host'])) {
    header("HTTP/1.0 400 Bad Request");
    header("Content-Type: text/plain");
    print "pageurl must be an absolute http:// or https:// URL.\n";
    return;
  }
  if (empty($pageurlparts['path'])) $pageurl .= "/";

  // Warnings are suppressed because failure is reported explicitly below.
  $data = @file_get_contents($pageurl);
  if ($data === false) {
    header("HTTP/1.0 502 Bad Gateway");
    header("Content-Type: text/plain");
    print "Could not fetch " . $pageurl . "\n";
    return;
  }

  $channel_title = "";
  if (preg_match('/<title>(.+?)<\/title>/i', $data, $regs) > 0) {
    $channel_title = $regs[1];
  }

  $channel_desc = "";
  if (preg_match('/<meta .*description.*"(.+?)"/i', $data, $regs) > 0) {
    $channel_desc = $regs[1];
  }
  if ($channel_desc == "") $channel_desc = $pageurl;

  $items = array();
  $match_count = preg_match_all($itemregexp, $data, $items);
  if ($match_count === false) {
    // PCRE gave up (e.g. backtrack limit on a large page): emit no items
    // rather than indexing into an empty result.
    $match_count = 0;
  }
  $match_count = min($match_count, max(0, (int) $max_items));

  header("Content-Type: text/xml");

  $output = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" ?>\n";
  $output .= "<!-- generator=\"rssgenr8/0.92\" -->\n";
  // The Latin-1 entity set is referenced so that named entities produced by
  // htmlentities() are resolvable by XML parsers.
  $output .= "<!DOCTYPE rss PUBLIC \"-//W3C//ENTITIES Latin 1 for XHTML//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent\">\n";
  $output .= "<rss version=\"0.92\">\n";
  $output .= "  <channel>\n";
  $output .= "    <title>". rssgenr8_escape(strip_tags($channel_title)) ."</title>\n";
  $output .= "    <link>". rssgenr8_escape($pageurl) ."</link>\n";
  $output .= "    <description>". rssgenr8_escape($channel_desc) ."</description>\n";
  $output .= "    <webMaster>". rssgenr8_escape("webmaster") ."</webMaster>\n";
  $output .= "    <generator>". rssgenr8_escape("RSSgenr8 from XMLhub.com") ."</generator>\n";
  $output .= "    <language>en</language>\n";

  for ($i = 0; $i < $match_count; $i++) {
    $desc = $items[1][$i];

    // Title and description are both derived from the whitespace-normalised
    // text, so the title found below is an exact prefix of the description.
    $flat = wsstrip($desc);
    $title = $flat;
    $descout = $flat;

    if (preg_match("/(.+?)(?:<\/P|<\/div|<br|<\/h|<\/td)/i", $flat, $regs) > 0) {
      $title = $regs[1];
      $is_short = strlen(wsstrip(trim(strip_tags($title)))) < 100;
      // A short title is shown in full, so drop it from the description to
      // avoid repeating it. Only the leading occurrence is removed; later
      // occurrences of the same text in the body are left alone.
      if ($is_short && strncmp($descout, $title, strlen($title)) == 0) {
        $descout = (string) substr($descout, strlen($title));
      }
    }

    $title = wsstrip(trim(strip_tags($title)));
    if (strlen($title) > 100) {
      $title = substr($title, 0, 100) . " ...";
    }

    $item_url = get_link($desc, $pageurl);

    $descout = wsstrip(strip_tags($descout, $allowable_tags));
    // Drop a line break left at the very start once the title is removed.
    if (strpos($descout, "<br>") === 0) {
      $descout = substr($descout, 4);
    }
    if (strpos($descout, "<br />") === 0) {
      $descout = substr($descout, 6);
    }
    $descout = rssgenr8_escape(wsstrip($descout));

    $output .= "    <item>\n";
    $output .= "      <title>". rssgenr8_escape($title) ."</title>\n";
    $output .= "      <link>". rssgenr8_escape($item_url) ."</link>\n";
    $output .= "      <description>". $descout ."</description>\n";
    $output .= "    </item>\n";
  }

  $output .= "  </channel>\n";
  $output .= "</rss>\n";

  print $output;
}

/**
 * Escapes text for the feed, which is declared as ISO-8859-1.
 *
 * The charset is passed explicitly: with the UTF-8 default of newer PHP
 * versions, htmlentities() can return an empty string for Latin-1 input.
 */
function rssgenr8_escape($text) {
  return htmlentities((string) $text, ENT_COMPAT, 'ISO-8859-1');
}

/**
 * Returns the URL of the first link found in an HTML fragment.
 *
 * Accepts double-quoted, single-quoted and unquoted href values. HTML
 * entities in the value (e.g. &amp;) are decoded so the result is the real
 * URL. A link with neither scheme nor host is resolved against $pageurl via
 * make_abs(). When the fragment has no usable href, $pageurl is returned.
 *
 * @param string $desc    HTML fragment to search.
 * @param string $pageurl URL of the page the fragment came from.
 * @return string Absolute link URL, or $pageurl as the fallback.
 */
function get_link($desc, $pageurl) {
  $pattern = '/\bhref\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/i';
  if (!preg_match($pattern, $desc, $found)) {
    return $pageurl;
  }

  // Exactly one of the three alternatives captured the value.
  $linkurl = '';
  for ($group = 1; $group <= 3; $group++) {
    if (isset($found[$group]) && $found[$group] !== '') {
      $linkurl = $found[$group];
      break;
    }
  }

  $linkurl = trim(html_entity_decode($linkurl, ENT_QUOTES, 'ISO-8859-1'));
  if ($linkurl === '') {
    // An empty href points at the current document.
    return $pageurl;
  }

  $linkparts = parse_url($linkurl);
  if (!is_array($linkparts)) {
    // Too malformed to resolve; fall back to the page itself.
    return $pageurl;
  }

  // Only path-style references are relative. Links that carry a scheme
  // (http:, mailto:, ...) or a host are already complete.
  if (empty($linkparts['host']) && empty($linkparts['scheme'])) {
    $linkurl = make_abs($linkurl, $pageurl);
  }
  return $linkurl;
}

/**
 * Normalises whitespace: every CR, TAB and LF becomes a space, the ends are
 * trimmed, and runs of spaces collapse to a single space.
 *
 * Uses PCRE because the POSIX ereg_* functions no longer exist in PHP 7+.
 *
 * @param string $str Text to normalise.
 * @return string Normalised text.
 */
function wsstrip($str)
{
  $str = preg_replace('/[\r\t\n]/', ' ', (string) $str);
  $str = preg_replace('/ +/', ' ', trim($str));
  return $str;
}


/**
 * Resolves a relative URI against an absolute base URL.
 *
 * A reference starting with "/" replaces the whole path of the base.
 * Otherwise it is appended to the base's directory and "." / ".." segments
 * are collapsed.
 *
 * @param string $rel_uri             Relative reference (path form).
 * @param string $base                Absolute URL the reference was found on.
 * @param bool   $REMOVE_LEADING_DOTS Drop "." / ".." segments that would
 *                                    climb above the site root.
 * @return string Scheme and host of $base followed by the resolved path.
 */
function make_abs($rel_uri, $base, $REMOVE_LEADING_DOTS = true) {
  // Split off "scheme://host". A base without any path (http://host) is
  // treated as http://host/ so that the directory logic below still works.
  $base_start = '';
  if (preg_match('~^([^:]+://[^/]+)(/|$)~', $base, $m)) {
    $base_start = $m[1];
    if ($m[2] === '') {
      $base .= '/';
    }
  }

  if (preg_match("'^/'", $rel_uri)) {
    return $base_start . $rel_uri;
  }

  // Directory of the base + the reference, reduced to the path part only.
  $base = preg_replace('{[^/]+$}', '', $base);
  $base .= $rel_uri;
  $base = preg_replace('{^[^:]+://[^/]+}', '', $base);

  $base_array = explode('/', $base);
  if (count($base_array) and !strlen($base_array[0])) {
    array_shift($base_array);
  }

  // Walk the segments pairwise, dropping "." and cancelling "dir/..".
  $i = 1;
  while ($i < count($base_array)) {
    if ($base_array[$i - 1] == ".") {
      array_splice($base_array, $i - 1, 1);
      if ($i > 1) $i--;
    } elseif ($base_array[$i] == ".." and $base_array[$i - 1] != "..") {
      array_splice($base_array, $i - 1, 2);
      if ($i > 1) {
        $i--;
        // The pair was at the end: keep a trailing slash on the result.
        if ($i == count($base_array)) array_push($base_array, "");
      }
    } else {
      $i++;
    }
  }

  // A trailing "." means "this directory": keep only the slash. PHP has no
  // negative array indexes, so the last element is addressed via count().
  $last = count($base_array) - 1;
  if ($last >= 0 and $base_array[$last] == ".") {
    $base_array[$last] = "";
  }

  if ($REMOVE_LEADING_DOTS) {
    while (count($base_array) and preg_match('/^\.\.?$/', $base_array[0])) {
      array_shift($base_array);
    }
  }
  return($base_start . '/' . implode("/", $base_array));
}

?>

  • 2 months later...
This thread is more than a year old. Please don't revive it unless you have something important to add.

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Restore formatting

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.