savagenoob Posted January 29, 2011 Share Posted January 29, 2011 OK, I have the initial cURL request working, but I need to figure out how to extract the data I want from that webpage to display or store in a database. I tried using DOM and XPath, but because of the way the page is laid out using CSS, I think it's not picking it up. Here is my cURL script: <?php $userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)'; $target_url = "www.test.com"; $ch = curl_init(); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); curl_setopt($ch, CURLOPT_URL,$target_url); curl_setopt($ch, CURLOPT_FAILONERROR, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_RETURNTRANSFER,true); curl_setopt($ch, CURLOPT_TIMEOUT, 10); $html = curl_exec($ch); if (!$html) { echo "<br />cURL error number:" .curl_errno($ch); echo "<br />cURL error:" . curl_error($ch); exit; } // parse the html into a DOMDocument $dom = new DOMDocument(); $dom->loadHTML($html); // grab all the on the page $xpath = new DOMXPath($dom); $hrefs = $xpath->evaluate("/html/body//td"); for ($i = 0; $i < $hrefs->length; $i++) { $href = $hrefs->item($i); $url = $href->getAttribute('href'); storeLink($url,$target_url); echo "<br />Link stored: $url"; } ?> and here is a snippet of the source of the page I am getting: <span id="lblTest"><h1 id='surrZipTitle'>Agents in Surrounding Zip Codes</h1><table cellpadding='0' cellspacing='0' border='0' class='tblDent'><tr><td class='tdEliteTitle'><span class='caaSubHead3 addwidth'>H.K. 
Dent Elite</span></td></tr><tr><td class='tdEliteContent'><table cellpadding='0' cellspacing='0' border='0'><tr><td valign='top'><span class='caaAgencyName2 addwidth'>PROFESSIONAL INS ASSOC, INC.</span></td><td valign='top'> </td></tr></table><table cellpadding='0' cellspacing='0' border='0'><tr><td width='360px' valign='top'><div class='addressBlock'><span>4444 MANZANITA AVE STE 6</span><br /><span>CARMICHAEL , CA 95608-1488</span><br /><a class='faaBlueLink' id='lnkContact' href='http://www.safeco.com/portal/server.pt/gateway/PTARGS_0_20656_395_362_0_43/http%3B/por-portlets-prd.int.apps.safeco.com%3B13425/dotcom/FindAnAgent/find-an-agent/contactanagent.aspx?RequestType=agency&level=elite&Id=0415199904150295&lat=38.646142&lng=-121.327623' onclick='oOobj4.Preferences.Plugins.Events.poX=0;'>Contact & Directions</a> <a class='faaBlueLink' id='lnkWebSite' style='display: none;' href='http://' target='_blank' onclick="return trackEvent('/External-Link/AgentWebsite/ ','PROFESSIONAL INS ASSOC, INC. ');">Website</a></div></td><td valign='top'> </td></tr></table></td></tr></table><table cellpadding='0' cellspacing='0' border='0' class='tblDent'><tr><td class='tdEliteTitle'><span class='caaSubHead3 addwidth'>H.K. 
Dent Elite</span></td></tr><tr><td class='tdEliteContent'><table cellpadding='0' cellspacing='0' border='0'><tr><td valign='top'><span class='caaAgencyName2 addwidth'>AMERICAN AIM AUTO INS AGY, INC</span></td><td valign='top'> </td></tr></table><table cellpadding='0' cellspacing='0' border='0'><tr><td width='360px' valign='top'><div class='addressBlock'><span>5339 SAN JUAN AVE</span><br /><span>FAIR OAKS , CA 95628-3318</span><br /><a class='faaBlueLink' id='lnkContact' href='http://www.safeco.com/portal/server.pt/gateway/PTARGS_0_20656_395_362_0_43/http%3B/por-portlets-prd.int.apps.safeco.com%3B13425/dotcom/FindAnAgent/find-an-agent/contactanagent.aspx?RequestType=agency&level=elite&Id=0415911704151222&lat=38.66237&lng=-121.292429' onclick='oOobj4.Preferences.Plugins.Events.poX=0;'>Contact & Directions</a> So basically I want to extract the agency name like "<span class='caaAgencyName2 addwidth'>PROFESSIONAL INS ASSOC, INC.</span>" and the address which always use the same div class like "caaAgencyName2" and "addressBlock". How can this be accomplished? Link to comment https://forums.phpfreaks.com/topic/226088-using-curl-to-get-data-off-a-website-help-please/ Share on other sites More sharing options...
savagenoob Posted January 30, 2011 Author Share Posted January 30, 2011 I am trying: <?php $userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)'; header('Content-type: text/xml; charset=utf-8', true); $target_url = "test.com"; $ch = curl_init(); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); curl_setopt($ch, CURLOPT_URL,$target_url); curl_setopt($ch, CURLOPT_FAILONERROR, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_RETURNTRANSFER,true); curl_setopt($ch, CURLOPT_TIMEOUT, 10); $html = curl_exec($ch); $html = @mb_convert_encoding($html, 'HTML-ENTITIES', 'utf-8'); curl_close( $ch ); $char = "<span class=(\"|\'|)caaAgencyName2 addwidth(\"|\'|)>(.*?)</span>"; echo preg_match($char, $html); ?> but it's coming up blank... I think I am close. Link to comment https://forums.phpfreaks.com/topic/226088-using-curl-to-get-data-off-a-website-help-please/#findComment-1167143 Share on other sites More sharing options...
BlueSkyIS Posted January 30, 2011 Share Posted January 30, 2011 Do you want ALL matches, or will there only be one? I assume you want all (preg_match_all); preg_match would get just the first one. $char = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/'; $presult = preg_match_all($char, $content, $matches); print_r($matches); Edited to further simplify the regular expression. Link to comment https://forums.phpfreaks.com/topic/226088-using-curl-to-get-data-off-a-website-help-please/#findComment-1167146 Share on other sites More sharing options...
savagenoob Posted January 30, 2011 Author Share Posted January 30, 2011 This is actually working, but I am getting a weird array... Array ( [0] => Array ( [0] => PROFESSIONAL INS ASSOC, INC. [1] => AMERICAN AIM AUTO INS AGY, INC [2] => SHEPPARD AND NEILSON INSURANCE SERVICES, INC. [3] => PARK FAMILY INS AND FIN SVCS, IN [4] => PROFESSIONAL INS ASSOC, INC. [5] => WES WRIGHT INS SERVICES PROFESSIONAL INS ASSOC, INC. [6] => SHEPPARD AND NEILSON INSURANCE S [7] => R K JACOBS INSURANCE SERVICE [8] => JOHN C MEYER AGENCY [9] => JOHN C MEYER AGENCY [10] => DIVIDE INSURANCE AGENCY ) [1] => Array ( [0] => ' [1] => ' [2] => ' [3] => ' [4] => ' [5] => ' [6] => ' [7] => ' [8] => ' [9] => ' [10] => ' ) [2] => Array ( [0] => ' [1] => ' [2] => ' [3] => ' [4] => ' [5] => ' [6] => ' [7] => ' [8] => ' [9] => ' [10] => ' ) [3] => Array ( [0] => PROFESSIONAL INS ASSOC, INC. [1] => AMERICAN AIM AUTO INS AGY, INC [2] => SHEPPARD AND NEILSON INSURANCE SERVICES, INC. [3] => PARK FAMILY INS AND FIN SVCS, IN [4] => PROFESSIONAL INS ASSOC, INC. [5] => WES WRIGHT INS SERVICES PROFESSIONAL INS ASSOC, INC. [6] => SHEPPARD AND NEILSON INSURANCE S [7] => R K JACOBS INSURANCE SERVICE [8] => JOHN C MEYER AGENCY [9] => JOHN C MEYER AGENCY [10] => DIVIDE INSURANCE AGENCY ) ) and how do I incorporate <div class='addressBlock'><span>5339 SAN JUAN AVE</span><br /><span>FAIR OAKS , CA 95628-3318</span> of each result into this regex? Thank you for your help. Link to comment https://forums.phpfreaks.com/topic/226088-using-curl-to-get-data-off-a-website-help-please/#findComment-1167157 Share on other sites More sharing options...
savagenoob Posted January 30, 2011 Author Share Posted January 30, 2011 OK, I can sort out the agency names, but I still need to modify the regex or run another one to extract the addresses and assign them to the agency name. I tried: $char = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/'; $presult = preg_match_all($char, $html, $matches); sort($matches); foreach( $matches[2] as $key => $value){ echo "Agency Name: $value <br />"; } $addressspan = '/<div class=("|\')addressBlock("|\')>(.*?)<\/span><\/br>/'; $addyres = preg_match_all($addressspan, $html, $matchesadd); print_r($matchesadd); but print_r($matchesadd); is coming up blank... Link to comment https://forums.phpfreaks.com/topic/226088-using-curl-to-get-data-off-a-website-help-please/#findComment-1167170 Share on other sites More sharing options...
savagenoob Posted January 30, 2011 Author Share Posted January 30, 2011 OK, this regex works : $char = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/'; $presult = preg_match_all($char, $html, $matches); foreach( $matches[0] as $key => $value){ echo "Agency Name: $value <br />"; } $addressspan = '/<div class=("|\')addressBlock("|\')>(.*?)<\/span><br \/><a class=/'; $addyres = preg_match_all($addressspan, $html, $matchesadd); foreach( $matchesadd[3] as $key => $value){ echo "Address: $value <br />"; } but how do I combine them now... hmmm... Link to comment https://forums.phpfreaks.com/topic/226088-using-curl-to-get-data-off-a-website-help-please/#findComment-1167181 Share on other sites More sharing options...
savagenoob Posted January 30, 2011 Author Share Posted January 30, 2011 This worked, thanks. $char = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/'; $presult = preg_match_all($char, $html, $matches); $data1 = $matches[0]; $addressspan = '/<div class=("|\')addressBlock("|\')>(.*?)<\/span><br \/><a class=/'; $addyres = preg_match_all($addressspan, $html, $matchesadd); $data2 = $matchesadd[3]; $data = array_combine($data1, $data2); foreach( $data as $key => $value){ ?> <tr> <td>Agency:</td><td><?php echo $key;?></td><td><?php echo $value; ?></td> </tr> <?php } Link to comment https://forums.phpfreaks.com/topic/226088-using-curl-to-get-data-off-a-website-help-please/#findComment-1167187 Share on other sites More sharing options...
Recommended Posts
Archived
This topic is now archived and is closed to further replies.