Jump to content

Opening websites with fopen/cURL


sidorak95

Recommended Posts

Hi,

 

I've been trying to find a specific piece of text on a webpage and show a popup if it's there. A friend recommended I use cURL to try to open the page first, and then go on from there. I found a simple script on the internet:

<?php
/**
 * Fetch the body of a URL with cURL.
 *
 * Some servers return an empty response to requests that carry no
 * User-Agent header, so a generic one is sent by default.
 *
 * @param string      $url       Address to request.
 * @param int         $timeout   Seconds to wait while connecting (default 5).
 * @param string|null $userAgent User-Agent header to send; pass null to send none.
 *
 * @return string|false The response body, or false when the request fails
 *                      (an E_USER_WARNING describing the failure is raised).
 */
function get_data($url, $timeout = 5, $userAgent = 'Mozilla/5.0 (compatible; PHP cURL)')
{
    $ch = curl_init();
    if ($ch === false) {
        trigger_error('get_data: unable to initialise cURL', E_USER_WARNING);
        return false;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    // Return the response as a string instead of printing it directly.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    if ($userAgent !== null) {
        curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
    }

    $data = curl_exec($ch);
    if ($data === false) {
        // Read the error before the handle is closed; otherwise the caller
        // only ever sees an unexplained empty result.
        trigger_error('get_data: request failed: ' . curl_error($ch), E_USER_WARNING);
    }
    curl_close($ch);

    return $data;
}


// Fetch the example page and print the result straight away.
// get_data() hands back whatever curl_exec() produced, so a failed
// request (false) prints as an empty string.
echo get_data('http://www.example.com/');
?>

This works successfully on both Google and Yahoo. However, for the site in question, nothing is displayed. A quick check shows that the result is empty. When I set the user-agent to Firefox's, the webpage is displayed.

 

The admin of that site says that no user-agent is being blocked, except for bots in one directory that I'm not trying to access.

 

Both file_get_contents and fopen result in an empty page.

 

Does anyone have any idea why this is?

 

Thanks.

Link to comment
Share on other sites

My curl script can fetch the info on the page.

It can echo the HTML, and the content is there.

What you do with the html will be up to you.

 

<?php
/*
 * Fetch a page with cURL, sending a browser-like User-Agent, and print
 * the HTML that comes back.
 *
 * The page is requested exactly once. cURL follows HTTP redirects itself,
 * so the address it ended up at is read from curl_getinfo() afterwards.
 */
$url = "http://www.clickcritters.com/";
$userAgent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3';

$ch = curl_init();
if ($ch === false) {
    exit('Unable to initialise cURL.');
}

curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
curl_setopt($ch, CURLOPT_TIMEOUT, 15);
// Return the body as a string rather than printing it immediately.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
// Follow up to 15 HTTP redirects, setting the Referer header on each hop.
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_MAXREDIRS, 15);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
// An empty string accepts every content encoding this cURL build supports.
curl_setopt($ch, CURLOPT_ENCODING, "");
// CURLOPT_HEADER is left off so the raw HTTP headers are not mixed into the
// HTML that gets echoed below. Certificate verification is left at its
// default (enabled) in case a redirect leads to an https:// address.

$html = curl_exec($ch);
if ($html === false) {
    // Grab the error text before the handle is released.
    $error = curl_error($ch);
    curl_close($ch);
    exit('cURL request failed: ' . $error);
}

$response = curl_getinfo($ch);
curl_close($ch);

/* Resolve the URL to its actual location. */
// Start from the address cURL finished on after following HTTP redirects.
$finalurl = $response['url'];

// A page can also redirect from JavaScript; if it does, take that target.
if (
    preg_match("/window\.location\.replace\('(.*)'\)/i", $html, $value) ||
    preg_match("/window\.location\=[\"'](.*)[\"']/i", $html, $value) ||
    preg_match("/location\.href\=[\"'](.*)[\"']/i", $html, $value)
) {
    $finalurl = $value[1];
}

// Debug aid: uncomment to show the resolved address.
//echo $finalurl."<br />";

// View the site.
echo $html;

// Alternative: print only the text with the tags stripped.
//echo strip_tags($html);
?>

Link to comment
Share on other sites

This thread is more than a year old. Please don't revive it unless you have something important to add.

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Restore formatting

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.