mark107 Posted September 8, 2013 Share Posted September 8, 2013 Hi guys,I have got a problem with scraping the data from a third party website. I'm currently using a preg_match_all method with each different title tags including the values to output the data from a third party website to my website where I can see some of the data are missing.Here's what the HTML is look like from a third party: <span id="row1Time" class="zc-ssl-pg-time">9:00 AM</span> <a id="rowTitle1" class="zc-ssl-pg-title">CBS News Sunday Morning</a> <span id="row2Time" class="zc-ssl-pg-time">10:30 AM</span> <a id="rowTitle2" class="zc-ssl-pg-title">Face the Nation</a> <span id="row3Time" class="zc-ssl-pg-time">11:30 AM</span> <span id="rowTitle3" class="zc-ssl-pg-title">Local Programming</span> <span id="row4Time" class="zc-ssl-pg-time">12:00 PM</span> <a id="rowTitle4" class="zc-ssl-pg-title">The NFL Today</a> <span id="row5Time" class="zc-ssl-pg-time">1:00 PM</span> <a id="rowTitle5" class="zc-ssl-pg-title">NFL Football</a> <span id="row6Time" class="zc-ssl-pg-time">4:30 PM</span> <a id="rowTitle6" class="zc-ssl-pg-title"'>2013 U.S. Open Tennis</a> <span id="row7Time" class="zc-ssl-pg-time">7:00 PM</span> <span id="rowTitle7" class="zc-ssl-pg-title">Local Programming</span> <span id="row8Time" class="zc-ssl-pg-time">7:30 PM</span> <a id="rowTitle8" class="zc-ssl-pg-title">CBS Evening News</a> Here is the HTML output data on my website: <span id='time1'>9:00 AM</span> - <span id='title1'>CBS News Sunday Morning</span><br></br> <span id='time2'>10:30 AM</span> - <span id='title2'>Face the Nation</span><br></br> <span id='time3'></span> - <span id='title3'></span><br></br> <span id='time4'>12:00 PM</span> - <span id='title4'>The NFL Today</span><br></br> <span id='time5'>3:30 PM</span> - <span id='title5'>The Bold and the Beautiful</span><br></br> <span id='time6'>4:00 PM</span> - <span id='title6'>The Talk</span><br></br> <span id='time7'></span> - <span id='title7'></span><br></br> <span id='time8'>7:30 PM</span> - <span id='title8'>CBS Evening News</span><br></br> Here's the php code: <?php define('DB_HOST', 'localhost'); define('DB_USER', 'myusername'); define('DB_PASSWORD', 'mypassword'); define('DB_DATABASE', 'mydb'); $errmsg_arr = array(); $errflag = false; $link = mysql_connect(DB_HOST, DB_USER, DB_PASSWORD); if(!$link) { die('Failed to connect to server: ' . mysql_error()); } $db = mysql_select_db(DB_DATABASE); if(!$db) { die("Unable to select database"); } function clean($var) { return mysql_real_escape_string(strip_tags($var)); } $channels = clean($_GET['channels']); $id = clean($_GET['id']); if($errflag) { $_SESSION['ERRMSG_ARR'] = $errmsg_arr; echo implode('<br />',$errmsg_arr); } else { $insert = array(); if(isset($_GET['channels'])) { $insert[] = 'channels = \'' . clean($_GET['channels']) .'\''; } if(isset($_GET['id'])) { $insert[] = 'id = \'' . clean($_GET['id']) . '\''; } if($channels && $id) { $qrytable1="SELECT id, channels, links FROM tvguide WHERE channels='$channels' && id='$id'"; $result1=mysql_query($qrytable1) or die('Error:<br />' . $qry . '<br />' . mysql_error()); while ($row = mysql_fetch_array($result1)) { $links = $row['links']; $data = file_get_contents($links); preg_match_all('/<span id="row1Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches); preg_match_all('/<a id="rowTitle1\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches1); $time1 = $matches[1]; $titles1 = $matches1[1]; echo "<span id='time1'>".$time1[1]."</span> - <span id='title1'>".$titles1[1]."</span><br></br>"; preg_match_all('/<span id="row2Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches); preg_match_all('/<a id="rowTitle2\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches2); $time2 = $matches[1]; $titles2 = $matches2[1]; echo "<span id='time2'>".$time2[1]."</span> - <span id='title2'>".$titles2[1]."</span><br></br>"; preg_match_all('/<span id="row3Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches); preg_match_all('/<a id="rowTitle3\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches3); $time3 = $matches[1]; $titles3 = $matches3[1]; echo "<span id='time3'>".$time3[1]."</span> - <span id='title3'>".$titles3[1]."</span><br></br>"; preg_match_all('/<span id="row4Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches); preg_match_all('/<a id="rowTitle4\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches4); $time4 = $matches[1]; $titles4 = $matches4[1]; echo "<span id='time4'>".$time4[1]."</span> - <span id='title4'>".$titles4[1]."</span><br></br>"; preg_match_all('/<span id="row5Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches); preg_match_all('/<a id="rowTitle5\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches5); $time5 = $matches[1]; $titles5 = $matches5[1]; echo "<span id='time5'>".$time5[1]."</span> - <span id='title5'>".$titles5[1]."</span><br></br>"; preg_match_all('/<span id="row6Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches); preg_match_all('/<a id="rowTitle6\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches6); $time6 = $matches[1]; $titles6 = $matches6[1]; echo "<span id='time6'>".$time6[1]."</span> - <span id='title6'>".$titles6[1]."</span><br></br>"; preg_match_all('/<span id="row7Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches); preg_match_all('/<a id="rowTitle7\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches7); $time7 = $matches[1]; $titles7 = $matches7[1]; echo "<span id='time7'>".$time7[1]."</span> - <span id='title7'>".$titles7[1]."</span><br></br>"; preg_match_all('/<span id="row8Time\" class="zc-ssl-pg-time">([^<]+)<\/span>[^>]+>([^<]+)<\/a>/im', $data, $matches); preg_match_all('/<a id="rowTitle8\" class="zc-ssl-pg-title"[^>]*>([^<]+)<\/a>/im', $data, $matches8); $time8 = $matches[1]; $titles8 = $matches8[1]; echo "<span id='time8'>".$time8[1]."</span> - <span id='title8'>".$titles8[1]."</span><br></br>"; } mysql_close($link); } else if(!$channels && ! $id) { $qrytable1="SELECT id, channels, links FROM tvguide"; $result1=mysql_query($qrytable1) or die('Error:<br />' . $qry . '<br />' . mysql_error()); while ($row = mysql_fetch_array($result1)) { echo "<p id='channels'>"; echo $row['channels']; echo "<p id='links'>"; echo . $row["channels"] . "&id=" . $row["id"] .'</p>'; } } } ?> Does anyone know how I can scrape the data using with the preg_match_all or similar method that I currently use including with the time and the title tags with the values so I can output the data without being missing?I tried with PHP DOM, but I have no idea how to scrape the ids and the classes.If you could post the example PHP DOM including with the ids and classes, I would be very grateful.Any advice would be much appreciated.Thanks in advance Quote Link to comment https://forums.phpfreaks.com/topic/281983-missing-data-when-scraping/ Share on other sites More sharing options...
mark107 Posted September 8, 2013 Author Share Posted September 8, 2013 ????????????????????????????????? Quote Link to comment https://forums.phpfreaks.com/topic/281983-missing-data-when-scraping/#findComment-1448734 Share on other sites More sharing options...
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.