Jump to content

Using cURL to get data off a website Help Please


savagenoob

Recommended Posts

OK, I have the initial cURL request working, but I need to figure out how to extract the data I want from that webpage so I can display it or store it in a database. I tried using DOM and XPath, but because of the way the page is laid out with CSS, I think it's not picking anything up. Here is my cURL script:

<?php
// Fetch a page with cURL, parse it into a DOMDocument, and walk the <td>
// nodes selected by an XPath query.

// User-Agent string sent with the request.
$userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';

$target_url = "www.test.com";
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_URL,$target_url);
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
// Make curl_exec() return the response body instead of printing it.
curl_setopt($ch, CURLOPT_RETURNTRANSFER,true);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
$html = curl_exec($ch);
// Print the cURL error details and stop when no usable body came back.
if (!$html) {
echo "<br />cURL error number:" .curl_errno($ch);
echo "<br />cURL error:" . curl_error($ch);
exit;
}
// parse the html into a DOMDocument
$dom = new DOMDocument();
$dom->loadHTML($html);

// select every <td> element below <body>
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//td");

// NOTE(review): the query selects <td> nodes, but the loop reads an 'href'
// attribute from each one. None of the <td> tags in the sample markup carry
// an href, so $url is presumably empty for every node -- confirm.
for ($i = 0; $i < $hrefs->length; $i++) {
$href = $hrefs->item($i);
$url = $href->getAttribute('href');
// NOTE(review): storeLink() is not defined anywhere in this snippet -- verify
// it exists before running.
storeLink($url,$target_url);
echo "<br />Link stored: $url";
}

?>

and here is a snippet of the source of the page I am getting:

 

<span id="lblTest"><h1 id='surrZipTitle'>Agents in Surrounding Zip Codes</h1><table cellpadding='0' cellspacing='0' border='0' class='tblDent'><tr><td class='tdEliteTitle'><span class='caaSubHead3 addwidth'>H.K. Dent Elite</span></td></tr><tr><td class='tdEliteContent'><table cellpadding='0' cellspacing='0' border='0'><tr><td valign='top'><span class='caaAgencyName2 addwidth'>PROFESSIONAL INS ASSOC, INC.</span></td><td valign='top'> </td></tr></table><table cellpadding='0' cellspacing='0' border='0'><tr><td width='360px' valign='top'><div class='addressBlock'><span>4444 MANZANITA AVE STE 6</span><br /><span>CARMICHAEL                    , CA 95608-1488</span><br /><a class='faaBlueLink' id='lnkContact' href='http://www.safeco.com/portal/server.pt/gateway/PTARGS_0_20656_395_362_0_43/http%3B/por-portlets-prd.int.apps.safeco.com%3B13425/dotcom/FindAnAgent/find-an-agent/contactanagent.aspx?RequestType=agency&level=elite&Id=0415199904150295&lat=38.646142&lng=-121.327623' onclick='oOobj4.Preferences.Plugins.Events.poX=0;'>Contact & Directions</a>    <a class='faaBlueLink' id='lnkWebSite' style='display: none;' href='http://' target='_blank' onclick="return trackEvent('/External-Link/AgentWebsite/                                                                      ','PROFESSIONAL INS ASSOC, INC.                                ');">Website</a></div></td><td valign='top'> </td></tr></table></td></tr></table><table cellpadding='0' cellspacing='0' border='0' class='tblDent'><tr><td class='tdEliteTitle'><span class='caaSubHead3 addwidth'>H.K. 
Dent Elite</span></td></tr><tr><td class='tdEliteContent'><table cellpadding='0' cellspacing='0' border='0'><tr><td valign='top'><span class='caaAgencyName2 addwidth'>AMERICAN AIM AUTO INS AGY, INC</span></td><td valign='top'> </td></tr></table><table cellpadding='0' cellspacing='0' border='0'><tr><td width='360px' valign='top'><div class='addressBlock'><span>5339 SAN JUAN AVE</span><br /><span>FAIR OAKS                    , CA 95628-3318</span><br /><a class='faaBlueLink' id='lnkContact' href='http://www.safeco.com/portal/server.pt/gateway/PTARGS_0_20656_395_362_0_43/http%3B/por-portlets-prd.int.apps.safeco.com%3B13425/dotcom/FindAnAgent/find-an-agent/contactanagent.aspx?RequestType=agency&level=elite&Id=0415911704151222&lat=38.66237&lng=-121.292429' onclick='oOobj4.Preferences.Plugins.Events.poX=0;'>Contact & Directions</a>

 

So basically I want to extract the agency name, e.g. "<span class='caaAgencyName2 addwidth'>PROFESSIONAL INS ASSOC, INC.</span>", and the address. These always use the same classes: the name is in a span with class "caaAgencyName2" and the address is in a div with class "addressBlock".  How can this be accomplished?

 

I am trying:

<?php
// Fetch the page with cURL and try to pull the agency name out of the markup
// with a regular expression.
$userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';
// The script's own output is declared as XML.
header('Content-type: text/xml; charset=utf-8', true);

$target_url = "test.com";
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_URL,$target_url);
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
// Make curl_exec() return the response body instead of printing it.
curl_setopt($ch, CURLOPT_RETURNTRANSFER,true);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
// NOTE(review): the result of curl_exec() is not checked for failure here.
$html = curl_exec($ch);
// The @ operator hides any warning raised by the conversion.
$html = @mb_convert_encoding($html, 'HTML-ENTITIES', 'utf-8');

curl_close( $ch );
// NOTE(review): this pattern is not wrapped in regex delimiters (e.g. /.../).
// PCRE patterns in PHP need them, so preg_match() probably fails here -- verify.
$char  = "<span class=(\"|\'|)caaAgencyName2 addwidth(\"|\'|)>(.*?)</span>";
// This echoes preg_match()'s return value, not the captured text; no third
// argument is passed to receive the captures.
echo preg_match($char, $html);
?>

but it's coming up blank... I think I am close.

Do you want ALL matches, or will there only be one? I assume you want all of them (preg_match_all); preg_match will only return the first one.

 

// Capture every agency-name span. Groups 1 and 2 hold the opening and closing
// quote characters; group 3 holds the text between the tags.
$char  = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/';
// $html is the page markup returned by curl_exec() in the script above.
$presult = preg_match_all($char, $html, $matches);
print_r($matches);

 

edited to further simplify regular expression

This is actually working, but I am getting a weird array...

Array ( [0] => Array ( [0] => PROFESSIONAL INS ASSOC, INC. [1] => AMERICAN AIM AUTO INS AGY, INC [2] => SHEPPARD AND NEILSON INSURANCE SERVICES, INC. [3] => PARK FAMILY INS AND FIN SVCS, IN [4] => PROFESSIONAL INS ASSOC, INC. [5] => WES WRIGHT INS SERVICES PROFESSIONAL INS ASSOC, INC. [6] => SHEPPARD AND NEILSON INSURANCE S [7] => R K JACOBS INSURANCE SERVICE [8] => JOHN C MEYER AGENCY [9] => JOHN C MEYER AGENCY [10] => DIVIDE INSURANCE AGENCY ) [1] => Array ( [0] => ' [1] => ' [2] => ' [3] => ' [4] => ' [5] => ' [6] => ' [7] => ' [8] => ' [9] => ' [10] => ' ) [2] => Array ( [0] => ' [1] => ' [2] => ' [3] => ' [4] => ' [5] => ' [6] => ' [7] => ' [8] => ' [9] => ' [10] => ' ) [3] => Array ( [0] => PROFESSIONAL INS ASSOC, INC. [1] => AMERICAN AIM AUTO INS AGY, INC [2] => SHEPPARD AND NEILSON INSURANCE SERVICES, INC. [3] => PARK FAMILY INS AND FIN SVCS, IN [4] => PROFESSIONAL INS ASSOC, INC. [5] => WES WRIGHT INS SERVICES PROFESSIONAL INS ASSOC, INC. [6] => SHEPPARD AND NEILSON INSURANCE S [7] => R K JACOBS INSURANCE SERVICE [8] => JOHN C MEYER AGENCY [9] => JOHN C MEYER AGENCY [10] => DIVIDE INSURANCE AGENCY ) )

 

and how do I incorporate <div class='addressBlock'><span>5339 SAN JUAN AVE</span><br /><span>FAIR OAKS                    , CA 95628-3318</span>

 

of each result into this regex? Thank you for your help.

OK, I can sort out the agency names, but still need to modify the regex or run another one to extract the addresses and assign them to the agency name, I tried:

 

// Collect the agency names, then try to collect the address blocks.
// Groups 1 and 2 capture the quote characters; group 3 captures the name text.
$char  = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/';
$presult = preg_match_all($char, $html, $matches);
// NOTE(review): sort() reorders and reindexes the outer array of capture
// groups, so index 2 below may no longer be the pattern's second group -- confirm.
sort($matches);
foreach( $matches[2] as $key => $value){
echo "Agency Name: $value <br />";
}
// NOTE(review): this pattern expects "</span></br>", while the sample markup
// has "</span><br />" -- presumably why nothing matches; verify.
$addressspan = '/<div class=("|\')addressBlock("|\')>(.*?)<\/span><\/br>/';
$addyres = preg_match_all($addressspan, $html, $matchesadd);
print_r($matchesadd);

 

but print_r($matchesadd); is coming up blank...

OK, this regex works :

 

// Print every agency name and every address found in the fetched markup.
// Groups 1 and 2 capture the quote characters; group 3 is the text itself.
$char  = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/';
$presult = preg_match_all($char, $html, $matches);

// Read group 3, not group 0: group 0 is the whole match, <span> tags included.
foreach ($matches[3] as $name) {
    // The text comes from a remote page, so decode it once and escape it on output.
    $plainName = html_entity_decode(strip_tags($name), ENT_QUOTES, 'UTF-8');
    echo 'Agency Name: ' . htmlspecialchars($plainName, ENT_QUOTES, 'UTF-8') . ' <br />';
}

$addressspan = '/<div class=("|\')addressBlock("|\')>(.*?)<\/span><br \/><a class=/';
$addyres = preg_match_all($addressspan, $html, $matchesadd);
foreach ($matchesadd[3] as $address) {
    // The capture still holds <span>/<br /> markup and stops inside an unclosed
    // <span>, so reduce it to a single line of plain text before printing.
    $plainAddress = strip_tags(str_replace('<br />', ', ', $address));
    $plainAddress = html_entity_decode($plainAddress, ENT_QUOTES, 'UTF-8');
    $plainAddress = trim(preg_replace('/\s+/', ' ', $plainAddress));
    echo 'Address: ' . htmlspecialchars($plainAddress, ENT_QUOTES, 'UTF-8') . ' <br />';
}

but how do I combine them now... hmmm...

This worked, thanks.

 

/**
 * Reduce a captured HTML fragment to plain text, one line per <br />.
 *
 * @param string $fragment Markup captured by one of the patterns in extractAgencyRows().
 * @return string Tag-free, entity-decoded text; lines are joined with "\n".
 */
function agencyPlainText($fragment)
{
    $text = preg_replace('/<br\s*\/?>/i', "\n", $fragment);
    // Decode entities here so the caller can escape exactly once on output.
    $text = html_entity_decode(strip_tags($text), ENT_QUOTES, 'UTF-8');

    $lines = array();
    foreach (explode("\n", $text) as $line) {
        // The source pads city names with long runs of spaces; collapse them.
        $line = trim(preg_replace('/\s+/', ' ', $line));
        if ($line !== '') {
            $lines[] = $line;
        }
    }

    return implode("\n", $lines);
}

/**
 * Extract agency name/address pairs from the scraped page markup.
 *
 * Names and addresses are matched separately and paired by position. The
 * result is a list rather than a name-keyed map, so agencies that appear more
 * than once (or share a name) are all kept.
 *
 * @param string $html Raw HTML returned by the cURL request.
 * @return array List of array('name' => string, 'address' => string) rows in
 *               plain text; empty when the two match counts differ.
 */
function extractAgencyRows($html)
{
    // In both patterns groups 1 and 2 are the quote characters, group 3 the content.
    $namePattern = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/';
    $addressPattern = '/<div class=("|\')addressBlock("|\')>(.*?)<\/span><br \/><a class=/';

    preg_match_all($namePattern, $html, $nameMatches);
    preg_match_all($addressPattern, $html, $addressMatches);

    $names = $nameMatches[3];
    $addresses = $addressMatches[3];

    // Positional pairing is only meaningful when both lists line up; otherwise
    // addresses would be attached to the wrong agencies.
    if (count($names) !== count($addresses)) {
        trigger_error(
            'Agency name count (' . count($names) . ') does not match address count ('
            . count($addresses) . ')',
            E_USER_WARNING
        );
        return array();
    }

    $rows = array();
    foreach ($names as $index => $name) {
        $rows[] = array(
            'name' => agencyPlainText($name),
            'address' => agencyPlainText($addresses[$index]),
        );
    }

    return $rows;
}

$data = extractAgencyRows($html);

foreach ($data as $row) {
?>

<tr>
<td>Agency:</td><td><?php echo htmlspecialchars($row['name'], ENT_QUOTES, 'UTF-8'); ?></td><td><?php echo nl2br(htmlspecialchars($row['address'], ENT_QUOTES, 'UTF-8')); ?></td>
</tr>
<?php
}

Archived

This topic is now archived and is closed to further replies.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.