Jump to content

Missing data when scraping


Recommended Posts

Hi guys,

I have a problem scraping data from a third-party website. I'm currently calling preg_match_all once for each time tag and each title tag to copy their values from the third-party site to my own page, but some of the data is missing from the output.

Here's what the HTML from the third-party site looks like:

    <span id="row1Time" class="zc-ssl-pg-time">9:00 AM</span>
    <a id="rowTitle1" class="zc-ssl-pg-title">CBS News Sunday Morning</a>
    <span id="row2Time" class="zc-ssl-pg-time">10:30 AM</span>
    <a id="rowTitle2" class="zc-ssl-pg-title">Face the Nation</a>
    <span id="row3Time" class="zc-ssl-pg-time">11:30 AM</span>
    <span id="rowTitle3" class="zc-ssl-pg-title">Local Programming</span>
    <span id="row4Time" class="zc-ssl-pg-time">12:00 PM</span>
    <a id="rowTitle4" class="zc-ssl-pg-title">The NFL Today</a>
    <span id="row5Time" class="zc-ssl-pg-time">1:00 PM</span>
    <a id="rowTitle5" class="zc-ssl-pg-title">NFL Football</a>
    <span id="row6Time" class="zc-ssl-pg-time">4:30 PM</span>
    <a id="rowTitle6" class="zc-ssl-pg-title"'>2013 U.S. Open Tennis</a>
    <span id="row7Time" class="zc-ssl-pg-time">7:00 PM</span>
    <span id="rowTitle7" class="zc-ssl-pg-title">Local Programming</span>
    <span id="row8Time" class="zc-ssl-pg-time">7:30 PM</span>
    <a id="rowTitle8" class="zc-ssl-pg-title">CBS Evening News</a>

Here is the HTML output data on my website:
    <span id='time1'>9:00 AM</span> - <span id='title1'>CBS News Sunday Morning</span><br></br>
    <span id='time2'>10:30 AM</span> - <span id='title2'>Face the Nation</span><br></br>
    <span id='time3'></span> - <span id='title3'></span><br></br>
    <span id='time4'>12:00 PM</span> - <span id='title4'>The NFL Today</span><br></br>
    <span id='time5'>3:30 PM</span> - <span id='title5'>The Bold and the Beautiful</span><br></br>
    <span id='time6'>4:00 PM</span> - <span id='title6'>The Talk</span><br></br>
    <span id='time7'></span> - <span id='title7'></span><br></br>
    <span id='time8'>7:30 PM</span> - <span id='title8'>CBS Evening News</span><br></br>

Here's the php code:
      define('DB_HOST', 'localhost');
      define('DB_USER', 'myusername');
      define('DB_PASSWORD', 'mypassword');
      define('DB_DATABASE', 'mydb');
      $errmsg_arr = array();
      $errflag = false;
      $link = mysql_connect(DB_HOST, DB_USER, DB_PASSWORD);
        die('Failed to connect to server: ' . mysql_error());
      $db = mysql_select_db(DB_DATABASE);
        die("Unable to select database");
      function clean($var)
        return mysql_real_escape_string(strip_tags($var));
      $channels = clean($_GET['channels']);
      $id = clean($_GET['id']);
        $_SESSION['ERRMSG_ARR'] = $errmsg_arr;
        echo implode('<br />',$errmsg_arr);
        $insert = array();
          $insert[] = 'channels = \'' . clean($_GET['channels']) .'\'';
          $insert[] = 'id = \'' . clean($_GET['id']) . '\'';
        if($channels && $id)
          $qrytable1="SELECT id, channels, links FROM tvguide WHERE channels='$channels' && id='$id'";
          $result1=mysql_query($qrytable1) or die('Error:<br />' . $qry . '<br />' . mysql_error());
          while ($row = mysql_fetch_array($result1))
            $links = $row['links'];
            $data = file_get_contents($links);
            preg_match_all('/<span id="row1Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle1\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches1);
            $time1 = $matches[1];
            $titles1 = $matches1[1];
            echo "<span id='time1'>".$time1[1]."</span> - <span id='title1'>".$titles1[1]."</span><br></br>";
            preg_match_all('/<span id="row2Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle2\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches2);
            $time2 = $matches[1];
            $titles2 = $matches2[1];
            echo "<span id='time2'>".$time2[1]."</span> - <span id='title2'>".$titles2[1]."</span><br></br>";
            preg_match_all('/<span id="row3Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle3\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches3);
            $time3 = $matches[1];
            $titles3 = $matches3[1];
            echo "<span id='time3'>".$time3[1]."</span> - <span id='title3'>".$titles3[1]."</span><br></br>";
            preg_match_all('/<span id="row4Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle4\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches4);
            $time4 = $matches[1];
            $titles4 = $matches4[1];
            echo "<span id='time4'>".$time4[1]."</span> - <span id='title4'>".$titles4[1]."</span><br></br>";
            preg_match_all('/<span id="row5Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle5\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches5);
            $time5 = $matches[1];
            $titles5 = $matches5[1];
            echo "<span id='time5'>".$time5[1]."</span> - <span id='title5'>".$titles5[1]."</span><br></br>";
            preg_match_all('/<span id="row6Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle6\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches6);
            $time6 = $matches[1];
            $titles6 = $matches6[1];
            echo "<span id='time6'>".$time6[1]."</span> - <span id='title6'>".$titles6[1]."</span><br></br>";
            preg_match_all('/<span id="row7Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle7\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches7);
            $time7 = $matches[1];
            $titles7 = $matches7[1];
            echo "<span id='time7'>".$time7[1]."</span> - <span id='title7'>".$titles7[1]."</span><br></br>";
            preg_match_all('/<span id="row8Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle8\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches8);
            $time8 = $matches[1];
            $titles8 = $matches8[1];
            echo "<span id='time8'>".$time8[1]."</span> - <span id='title8'>".$titles8[1]."</span><br></br>";
        else if(!$channels && ! $id)
          $qrytable1="SELECT id, channels, links FROM tvguide";
          $result1=mysql_query($qrytable1) or die('Error:<br />' . $qry . '<br />' . mysql_error());
          while ($row = mysql_fetch_array($result1))
            echo "<p id='channels'>";
            echo $row['channels'];
            echo "<p id='links'>";
            echo . $row["channels"] . "&id=" . $row["id"] .'</p>';

Does anyone know how I can scrape the data with preg_match_all, or a method similar to the one I currently use, so that the time and title values from every tag are output without anything missing?

I tried PHP DOM, but I have no idea how to select elements by their ids and classes.

If you could post a PHP DOM example that handles the ids and classes, I would be very grateful.

Any advice would be much appreciated.

Thanks in advance

Link to post
Share on other sites


This topic is now archived and is closed to further replies.

  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.