Jump to content

Missing data when scraping


mark107

Recommended Posts

Hi guys,

I have a problem scraping data from a third-party website. I'm currently calling preg_match_all once for each time tag and each title tag to copy the values from the third-party site onto my website, but some of the data is missing from the output.


Here's what the HTML from the third-party site looks like:

    <span id="row1Time" class="zc-ssl-pg-time">9:00 AM</span>
    <a id="rowTitle1" class="zc-ssl-pg-title">CBS News Sunday Morning</a>
    <span id="row2Time" class="zc-ssl-pg-time">10:30 AM</span>
    <a id="rowTitle2" class="zc-ssl-pg-title">Face the Nation</a>
    <span id="row3Time" class="zc-ssl-pg-time">11:30 AM</span>
    <span id="rowTitle3" class="zc-ssl-pg-title">Local Programming</span>
    <span id="row4Time" class="zc-ssl-pg-time">12:00 PM</span>
    <a id="rowTitle4" class="zc-ssl-pg-title">The NFL Today</a>
    <span id="row5Time" class="zc-ssl-pg-time">1:00 PM</span>
    <a id="rowTitle5" class="zc-ssl-pg-title">NFL Football</a>
    <span id="row6Time" class="zc-ssl-pg-time">4:30 PM</span>
    <a id="rowTitle6" class="zc-ssl-pg-title"'>2013 U.S. Open Tennis</a>
    <span id="row7Time" class="zc-ssl-pg-time">7:00 PM</span>
    <span id="rowTitle7" class="zc-ssl-pg-title">Local Programming</span>
    <span id="row8Time" class="zc-ssl-pg-time">7:30 PM</span>
    <a id="rowTitle8" class="zc-ssl-pg-title">CBS Evening News</a>


Here is the HTML output data on my website:
    <span id='time1'>9:00 AM</span> - <span id='title1'>CBS News Sunday Morning</span><br></br>
    <span id='time2'>10:30 AM</span> - <span id='title2'>Face the Nation</span><br></br>
    <span id='time3'></span> - <span id='title3'></span><br></br>
    <span id='time4'>12:00 PM</span> - <span id='title4'>The NFL Today</span><br></br>
    <span id='time5'>3:30 PM</span> - <span id='title5'>The Bold and the Beautiful</span><br></br>
    <span id='time6'>4:00 PM</span> - <span id='title6'>The Talk</span><br></br>
    <span id='time7'></span> - <span id='title7'></span><br></br>
    <span id='time8'>7:30 PM</span> - <span id='title8'>CBS Evening News</span><br></br>


Here's the php code:
    <?php
      define('DB_HOST', 'localhost');
      define('DB_USER', 'myusername');
      define('DB_PASSWORD', 'mypassword');
      define('DB_DATABASE', 'mydb');
          
      $errmsg_arr = array();
      $errflag = false;
      $link = mysql_connect(DB_HOST, DB_USER, DB_PASSWORD);
      
      if(!$link)
      {
        die('Failed to connect to server: ' . mysql_error());
      }
    
      $db = mysql_select_db(DB_DATABASE);
      if(!$db)
      {
        die("Unable to select database");
      }
    
      function clean($var)
      {
        return mysql_real_escape_string(strip_tags($var));
      }
      $channels = clean($_GET['channels']);
      $id = clean($_GET['id']);
      
      if($errflag)
      {
        $_SESSION['ERRMSG_ARR'] = $errmsg_arr;
        echo implode('<br />',$errmsg_arr);
      }
      else
      {
        $insert = array();
        
        if(isset($_GET['channels']))
        {
          $insert[] = 'channels = \'' . clean($_GET['channels']) .'\'';
        }
        if(isset($_GET['id']))
        {
          $insert[] = 'id = \'' . clean($_GET['id']) . '\'';
        }
        
        
        if($channels && $id)
        {
          $qrytable1="SELECT id, channels, links FROM tvguide WHERE channels='$channels' && id='$id'";
          $result1=mysql_query($qrytable1) or die('Error:<br />' . $qry . '<br />' . mysql_error());
              
            
          while ($row = mysql_fetch_array($result1))
          {
        
            $links = $row['links'];
            $data = file_get_contents($links);
            preg_match_all('/<span id="row1Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle1\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches1);
            $time1 = $matches[1];
            $titles1 = $matches1[1];
            echo "<span id='time1'>".$time1[1]."</span> - <span id='title1'>".$titles1[1]."</span><br></br>";
    
            preg_match_all('/<span id="row2Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle2\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches2);
            $time2 = $matches[1];
            $titles2 = $matches2[1];
            echo "<span id='time2'>".$time2[1]."</span> - <span id='title2'>".$titles2[1]."</span><br></br>";
    
            preg_match_all('/<span id="row3Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle3\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches3);
            $time3 = $matches[1];
            $titles3 = $matches3[1];
            echo "<span id='time3'>".$time3[1]."</span> - <span id='title3'>".$titles3[1]."</span><br></br>";
    
            preg_match_all('/<span id="row4Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle4\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches4);
            $time4 = $matches[1];
            $titles4 = $matches4[1];
            echo "<span id='time4'>".$time4[1]."</span> - <span id='title4'>".$titles4[1]."</span><br></br>";
            
            preg_match_all('/<span id="row5Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle5\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches5);
            $time5 = $matches[1];
            $titles5 = $matches5[1];
            echo "<span id='time5'>".$time5[1]."</span> - <span id='title5'>".$titles5[1]."</span><br></br>";
            
            preg_match_all('/<span id="row6Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle6\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches6);
            $time6 = $matches[1];
            $titles6 = $matches6[1];
            echo "<span id='time6'>".$time6[1]."</span> - <span id='title6'>".$titles6[1]."</span><br></br>";
            
            preg_match_all('/<span id="row7Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle7\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches7);
            $time7 = $matches[1];
            $titles7 = $matches7[1];
            echo "<span id='time7'>".$time7[1]."</span> - <span id='title7'>".$titles7[1]."</span><br></br>";
            
            preg_match_all('/<span id="row8Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle8\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches8);
            $time8 = $matches[1];
            $titles8 = $matches8[1];
            echo "<span id='time8'>".$time8[1]."</span> - <span id='title8'>".$titles8[1]."</span><br></br>";
     }
          mysql_close($link);
        }
        else if(!$channels && ! $id)
        {
          $qrytable1="SELECT id, channels, links FROM tvguide";
          $result1=mysql_query($qrytable1) or die('Error:<br />' . $qry . '<br />' . mysql_error());
         
          while ($row = mysql_fetch_array($result1))
          {
            echo "<p id='channels'>";
            echo $row['channels'];
            echo "<p id='links'>";
            echo . $row["channels"] . "&id=" . $row["id"] .'</p>';
          }
        }
      }
    ?>



Does anyone know how I can scrape both the time and the title values, using preg_match_all or a similar method, so that none of the data is missing from the output?

I tried with PHP DOM, but I have no idea how to scrape the ids and the classes.

If you could post a PHP DOM example that selects elements by their ids and classes, I would be very grateful.

Any advice would be much appreciated.

Thanks in advance

Link to comment
https://forums.phpfreaks.com/topic/281983-missing-data-when-scraping/
Share on other sites

Archived

This topic is now archived and is closed to further replies.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.