Jump to content

Missing data when scraping


mark107

Recommended Posts

Hi guys,

I have a problem scraping data from a third-party website. I'm currently calling preg_match_all once for each time tag and each title tag to copy their values from the third-party site to my own page, but some of the data is missing from my output.


Here's what the HTML on the third-party site looks like:

    <span id="row1Time" class="zc-ssl-pg-time">9:00 AM</span>
    <a id="rowTitle1" class="zc-ssl-pg-title">CBS News Sunday Morning</a>
    <span id="row2Time" class="zc-ssl-pg-time">10:30 AM</span>
    <a id="rowTitle2" class="zc-ssl-pg-title">Face the Nation</a>
    <span id="row3Time" class="zc-ssl-pg-time">11:30 AM</span>
    <span id="rowTitle3" class="zc-ssl-pg-title">Local Programming</span>
    <span id="row4Time" class="zc-ssl-pg-time">12:00 PM</span>
    <a id="rowTitle4" class="zc-ssl-pg-title">The NFL Today</a>
    <span id="row5Time" class="zc-ssl-pg-time">1:00 PM</span>
    <a id="rowTitle5" class="zc-ssl-pg-title">NFL Football</a>
    <span id="row6Time" class="zc-ssl-pg-time">4:30 PM</span>
    <a id="rowTitle6" class="zc-ssl-pg-title"'>2013 U.S. Open Tennis</a>
    <span id="row7Time" class="zc-ssl-pg-time">7:00 PM</span>
    <span id="rowTitle7" class="zc-ssl-pg-title">Local Programming</span>
    <span id="row8Time" class="zc-ssl-pg-time">7:30 PM</span>
    <a id="rowTitle8" class="zc-ssl-pg-title">CBS Evening News</a>


Here is the HTML output data on my website:
    <span id='time1'>9:00 AM</span> - <span id='title1'>CBS News Sunday Morning</span><br></br>
    <span id='time2'>10:30 AM</span> - <span id='title2'>Face the Nation</span><br></br>
    <span id='time3'></span> - <span id='title3'></span><br></br>
    <span id='time4'>12:00 PM</span> - <span id='title4'>The NFL Today</span><br></br>
    <span id='time5'>3:30 PM</span> - <span id='title5'>The Bold and the Beautiful</span><br></br>
    <span id='time6'>4:00 PM</span> - <span id='title6'>The Talk</span><br></br>
    <span id='time7'></span> - <span id='title7'></span><br></br>
    <span id='time8'>7:30 PM</span> - <span id='title8'>CBS Evening News</span><br></br>


Here's the php code:
    <?php
      define('DB_HOST', 'localhost');
      define('DB_USER', 'myusername');
      define('DB_PASSWORD', 'mypassword');
      define('DB_DATABASE', 'mydb');
          
      $errmsg_arr = array();
      $errflag = false;
      $link = mysql_connect(DB_HOST, DB_USER, DB_PASSWORD);
      
      if(!$link)
      {
        die('Failed to connect to server: ' . mysql_error());
      }
    
      $db = mysql_select_db(DB_DATABASE);
      if(!$db)
      {
        die("Unable to select database");
      }
    
      function clean($var)
      {
        return mysql_real_escape_string(strip_tags($var));
      }
      $channels = clean($_GET['channels']);
      $id = clean($_GET['id']);
      
      if($errflag)
      {
        $_SESSION['ERRMSG_ARR'] = $errmsg_arr;
        echo implode('<br />',$errmsg_arr);
      }
      else
      {
        $insert = array();
        
        if(isset($_GET['channels']))
        {
          $insert[] = 'channels = \'' . clean($_GET['channels']) .'\'';
        }
        if(isset($_GET['id']))
        {
          $insert[] = 'id = \'' . clean($_GET['id']) . '\'';
        }
        
        
        if($channels && $id)
        {
          $qrytable1="SELECT id, channels, links FROM tvguide WHERE channels='$channels' && id='$id'";
          $result1=mysql_query($qrytable1) or die('Error:<br />' . $qry . '<br />' . mysql_error());
              
            
          while ($row = mysql_fetch_array($result1))
          {
        
            $links = $row['links'];
            $data = file_get_contents($links);
            preg_match_all('/<span id="row1Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle1\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches1);
            $time1 = $matches[1];
            $titles1 = $matches1[1];
            echo "<span id='time1'>".$time1[1]."</span> - <span id='title1'>".$titles1[1]."</span><br></br>";
    
            preg_match_all('/<span id="row2Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle2\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches2);
            $time2 = $matches[1];
            $titles2 = $matches2[1];
            echo "<span id='time2'>".$time2[1]."</span> - <span id='title2'>".$titles2[1]."</span><br></br>";
    
            preg_match_all('/<span id="row3Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle3\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches3);
            $time3 = $matches[1];
            $titles3 = $matches3[1];
            echo "<span id='time3'>".$time3[1]."</span> - <span id='title3'>".$titles3[1]."</span><br></br>";
    
            preg_match_all('/<span id="row4Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle4\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches4);
            $time4 = $matches[1];
            $titles4 = $matches4[1];
            echo "<span id='time4'>".$time4[1]."</span> - <span id='title4'>".$titles4[1]."</span><br></br>";
            
            preg_match_all('/<span id="row5Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle5\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches5);
            $time5 = $matches[1];
            $titles5 = $matches5[1];
            echo "<span id='time5'>".$time5[1]."</span> - <span id='title5'>".$titles5[1]."</span><br></br>";
            
            preg_match_all('/<span id="row6Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle6\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches6);
            $time6 = $matches[1];
            $titles6 = $matches6[1];
            echo "<span id='time6'>".$time6[1]."</span> - <span id='title6'>".$titles6[1]."</span><br></br>";
            
            preg_match_all('/<span id="row7Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle7\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches7);
            $time7 = $matches[1];
            $titles7 = $matches7[1];
            echo "<span id='time7'>".$time7[1]."</span> - <span id='title7'>".$titles7[1]."</span><br></br>";
            
            preg_match_all('/<span id="row8Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches);
            preg_match_all('/<a id="rowTitle8\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches8);
            $time8 = $matches[1];
            $titles8 = $matches8[1];
            echo "<span id='time8'>".$time8[1]."</span> - <span id='title8'>".$titles8[1]."</span><br></br>";
     }
          mysql_close($link);
        }
        else if(!$channels && ! $id)
        {
          $qrytable1="SELECT id, channels, links FROM tvguide";
          $result1=mysql_query($qrytable1) or die('Error:<br />' . $qry . '<br />' . mysql_error());
         
          while ($row = mysql_fetch_array($result1))
          {
            echo "<p id='channels'>";
            echo $row['channels'];
            echo "<p id='links'>";
            echo . $row["channels"] . "&id=" . $row["id"] .'</p>';
          }
        }
      }
    ?>



Does anyone know how I can scrape the data using preg_match_all, or a similar method to the one I currently use, so that every time tag and title tag is matched and none of the values are missing from the output?

I tried with PHP DOM, but I have no idea how to scrape the ids and the classes.

If you could post a PHP DOM example that selects elements by their ids and classes, I would be very grateful.

Any advice would be much appreciated.

Thanks in advance

Link to comment
Share on other sites

This thread is more than a year old. Please don't revive it unless you have something important to add.

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Restore formatting

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.