savagenoob Posted January 29, 2011 Share Posted January 29, 2011 OK, I have the initial cURL working but need to figure out how to extract data I want off that webpage to display or store in a database, I tried using dom and xpath, but because of the way the page displays using css, i think its not picking it up. Here is my cURL script: <?php $userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)'; $target_url = "www.test.com"; $ch = curl_init(); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); curl_setopt($ch, CURLOPT_URL,$target_url); curl_setopt($ch, CURLOPT_FAILONERROR, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_RETURNTRANSFER,true); curl_setopt($ch, CURLOPT_TIMEOUT, 10); $html = curl_exec($ch); if (!$html) { echo "<br />cURL error number:" .curl_errno($ch); echo "<br />cURL error:" . curl_error($ch); exit; } // parse the html into a DOMDocument $dom = new DOMDocument(); $dom->loadHTML($html); // grab all the on the page $xpath = new DOMXPath($dom); $hrefs = $xpath->evaluate("/html/body//td"); for ($i = 0; $i < $hrefs->length; $i++) { $href = $hrefs->item($i); $url = $href->getAttribute('href'); storeLink($url,$target_url); echo "<br />Link stored: $url"; } ?> and here is a snippet of the source of the page I am getting: <span id="lblTest"><h1 id='surrZipTitle'>Agents in Surrounding Zip Codes</h1><table cellpadding='0' cellspacing='0' border='0' class='tblDent'><tr><td class='tdEliteTitle'><span class='caaSubHead3 addwidth'>H.K. 
Dent Elite</span></td></tr><tr><td class='tdEliteContent'><table cellpadding='0' cellspacing='0' border='0'><tr><td valign='top'><span class='caaAgencyName2 addwidth'>PROFESSIONAL INS ASSOC, INC.</span></td><td valign='top'> </td></tr></table><table cellpadding='0' cellspacing='0' border='0'><tr><td width='360px' valign='top'><div class='addressBlock'><span>4444 MANZANITA AVE STE 6</span><br /><span>CARMICHAEL , CA 95608-1488</span><br /><a class='faaBlueLink' id='lnkContact' href='http://www.safeco.com/portal/server.pt/gateway/PTARGS_0_20656_395_362_0_43/http%3B/por-portlets-prd.int.apps.safeco.com%3B13425/dotcom/FindAnAgent/find-an-agent/contactanagent.aspx?RequestType=agency&level=elite&Id=0415199904150295&lat=38.646142&lng=-121.327623' onclick='oOobj4.Preferences.Plugins.Events.poX=0;'>Contact & Directions</a> <a class='faaBlueLink' id='lnkWebSite' style='display: none;' href='http://' target='_blank' onclick="return trackEvent('/External-Link/AgentWebsite/ ','PROFESSIONAL INS ASSOC, INC. ');">Website</a></div></td><td valign='top'> </td></tr></table></td></tr></table><table cellpadding='0' cellspacing='0' border='0' class='tblDent'><tr><td class='tdEliteTitle'><span class='caaSubHead3 addwidth'>H.K. 
Dent Elite</span></td></tr><tr><td class='tdEliteContent'><table cellpadding='0' cellspacing='0' border='0'><tr><td valign='top'><span class='caaAgencyName2 addwidth'>AMERICAN AIM AUTO INS AGY, INC</span></td><td valign='top'> </td></tr></table><table cellpadding='0' cellspacing='0' border='0'><tr><td width='360px' valign='top'><div class='addressBlock'><span>5339 SAN JUAN AVE</span><br /><span>FAIR OAKS , CA 95628-3318</span><br /><a class='faaBlueLink' id='lnkContact' href='http://www.safeco.com/portal/server.pt/gateway/PTARGS_0_20656_395_362_0_43/http%3B/por-portlets-prd.int.apps.safeco.com%3B13425/dotcom/FindAnAgent/find-an-agent/contactanagent.aspx?RequestType=agency&level=elite&Id=0415911704151222&lat=38.66237&lng=-121.292429' onclick='oOobj4.Preferences.Plugins.Events.poX=0;'>Contact & Directions</a> So basically I want to extract the agency name like "<span class='caaAgencyName2 addwidth'>PROFESSIONAL INS ASSOC, INC.</span>" and the address which always use the same div class like "caaAgencyName2" and "addressBlock". How can this be accomplished? Quote Link to comment Share on other sites More sharing options...
savagenoob Posted January 30, 2011 Author Share Posted January 30, 2011 I am trying: <?php $userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)'; header('Content-type: text/xml; charset=utf-8', true); $target_url = "test.com"; $ch = curl_init(); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); curl_setopt($ch, CURLOPT_URL,$target_url); curl_setopt($ch, CURLOPT_FAILONERROR, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_RETURNTRANSFER,true); curl_setopt($ch, CURLOPT_TIMEOUT, 10); $html = curl_exec($ch); $html = @mb_convert_encoding($html, 'HTML-ENTITIES', 'utf-8'); curl_close( $ch ); $char = "<span class=(\"|\'|)caaAgencyName2 addwidth(\"|\'|)>(.*?)</span>"; echo preg_match($char, $html); ?> but it's coming up blank... I think I am close. Quote Link to comment Share on other sites More sharing options...
BlueSkyIS Posted January 30, 2011 Share Posted January 30, 2011 Do you want ALL matches, or will there only be one? I assume you want all (preg_match_all); preg_match only returns the first one. $char = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/'; $presult = preg_match_all($char, $content, $matches); print_r($matches); Edited to further simplify the regular expression. Quote Link to comment Share on other sites More sharing options...
savagenoob Posted January 30, 2011 Author Share Posted January 30, 2011 This is actually working, but I am getting a weird array... Array ( [0] => Array ( [0] => PROFESSIONAL INS ASSOC, INC. [1] => AMERICAN AIM AUTO INS AGY, INC [2] => SHEPPARD AND NEILSON INSURANCE SERVICES, INC. [3] => PARK FAMILY INS AND FIN SVCS, IN [4] => PROFESSIONAL INS ASSOC, INC. [5] => WES WRIGHT INS SERVICES PROFESSIONAL INS ASSOC, INC. [6] => SHEPPARD AND NEILSON INSURANCE S [7] => R K JACOBS INSURANCE SERVICE [8] => JOHN C MEYER AGENCY [9] => JOHN C MEYER AGENCY [10] => DIVIDE INSURANCE AGENCY ) [1] => Array ( [0] => ' [1] => ' [2] => ' [3] => ' [4] => ' [5] => ' [6] => ' [7] => ' [8] => ' [9] => ' [10] => ' ) [2] => Array ( [0] => ' [1] => ' [2] => ' [3] => ' [4] => ' [5] => ' [6] => ' [7] => ' [8] => ' [9] => ' [10] => ' ) [3] => Array ( [0] => PROFESSIONAL INS ASSOC, INC. [1] => AMERICAN AIM AUTO INS AGY, INC [2] => SHEPPARD AND NEILSON INSURANCE SERVICES, INC. [3] => PARK FAMILY INS AND FIN SVCS, IN [4] => PROFESSIONAL INS ASSOC, INC. [5] => WES WRIGHT INS SERVICES PROFESSIONAL INS ASSOC, INC. [6] => SHEPPARD AND NEILSON INSURANCE S [7] => R K JACOBS INSURANCE SERVICE [8] => JOHN C MEYER AGENCY [9] => JOHN C MEYER AGENCY [10] => DIVIDE INSURANCE AGENCY ) ) Also, how do I incorporate <div class='addressBlock'><span>5339 SAN JUAN AVE</span><br /><span>FAIR OAKS , CA 95628-3318</span> of each result into this regex? Thank you for your help. Quote Link to comment Share on other sites More sharing options...
savagenoob Posted January 30, 2011 Author Share Posted January 30, 2011 OK, I can sort out the agency names, but I still need to modify the regex or run another one to extract the addresses and assign them to the agency name. I tried: $char = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/'; $presult = preg_match_all($char, $html, $matches); sort($matches); foreach( $matches[2] as $key => $value){ echo "Agency Name: $value <br />"; } $addressspan = '/<div class=("|\')addressBlock("|\')>(.*?)<\/span><\/br>/'; $addyres = preg_match_all($addressspan, $html, $matchesadd); print_r($matchesadd); but print_r($matchesadd); is coming up blank... Quote Link to comment Share on other sites More sharing options...
savagenoob Posted January 30, 2011 Author Share Posted January 30, 2011 OK, this regex works : $char = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/'; $presult = preg_match_all($char, $html, $matches); foreach( $matches[0] as $key => $value){ echo "Agency Name: $value <br />"; } $addressspan = '/<div class=("|\')addressBlock("|\')>(.*?)<\/span><br \/><a class=/'; $addyres = preg_match_all($addressspan, $html, $matchesadd); foreach( $matchesadd[3] as $key => $value){ echo "Address: $value <br />"; } but how do I combine them now... hmmm... Quote Link to comment Share on other sites More sharing options...
savagenoob Posted January 30, 2011 Author Share Posted January 30, 2011 This worked, thanks. $char = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/'; $presult = preg_match_all($char, $html, $matches); $data1 = $matches[0]; $addressspan = '/<div class=("|\')addressBlock("|\')>(.*?)<\/span><br \/><a class=/'; $addyres = preg_match_all($addressspan, $html, $matchesadd); $data2 = $matchesadd[3]; $data = array_combine($data1, $data2); foreach( $data as $key => $value){ ?> <tr> <td>Agency:</td><td><?php echo $key;?></td><td><?php echo $value; ?></td> </tr> <?php } Quote Link to comment Share on other sites More sharing options...
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.