andz Posted July 25, 2010 Share Posted July 25, 2010 Hi, I have this problem when about using loop in DOMDocument. My script will going to get the pages (name, page link, description) from google.com by scraping its content. I already have tested the script and even I query the google site usign the script, the problem is that it only returning 1 page result despite the fact that google indexed 252 pages. Here's the actual script that I've been using to scrape on google. If you could help me find a turnaround on how do I possibly implement FOR() or FOREACH() loop to get all of the pages indexed by google. class google { function curl($domain) { $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $domain); curl_setopt($ch, CURLOPT_HEADER, 1); curl_setopt($ch, CURLOPT_FAILONERROR, true); curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_RETURNTRANSFER,true); curl_setopt($ch, CURLOPT_TIMEOUT, 45); curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0. Gecko/2009032609 Firefox/3.0.8'); curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); $data = curl_exec($ch); return $data; } function googleIndexed($domain) { # Filter entered domain name by user. $domain = strtolower($domain); $domain = trim(eregi_replace('http://', '', $domain)); $domain = trim(eregi_replace('http', '', $domain)); # Prepare the correct google url. $googleURL = 'http://www.google.com/search?hl=en&lr=&ie=UTF-8&q=site:'.$domain; # Parse google site. $curlGoogle = $this->curl($googleURL); # Parse the html result into a DOMDocument $dom = new DOMDocument(); @$dom->loadHTML($curlGoogle); # Grab the contents. $xpath = new DOMXPath($dom); # Parameters to be use for grabbing contents. # Grab content. // $params = $xpath->query("//h3[@class=r]"); $params = $xpath->query("//a[@class=l]"); $res = $params->item(0); $title = $res->textContent; $this->debug($title); } function debug($str) { echo '<pre>'; echo var_dump($str); echo '</pre>'; } } just name it as google.class.php, to use the script just do like this. $google = new google(); echo $google->googleIndexed('phpfreaks.com'); just replace the phpfreaks.com with whatever domain name you can think of. If you could help me, that'll be great. Thanks and have a nice day. Link to comment https://forums.phpfreaks.com/topic/208854-loop-using-curl-and-domdocument/ Share on other sites More sharing options...
Recommended Posts
Archived
This topic is now archived and is closed to further replies.