Jump to content

Why do I get different results with this code than with my browser?


korail

Recommended Posts

Hello,

 

I have the following code, which is designed to visit several URLs, save the HTML of each for further use (parsing for data) and then move on to the next one. The problem I'm having is that when you manually type one of the URLs into your browser, you get one of 3 different outcomes.

 

The first outcome, http://tiny.cc/gf07f, is what happens when you first enter one of the URLs: you get a login screen. Then, if you refresh the URL or enter another similar URL at the same domain, you get either http://tiny.cc/p40yy or http://tiny.cc/1f7u5. (These URLs have a four-digit number on the end, which identifies a locomotive running on the Korean National Railroad; depending on whether or not that locomotive is working a train, you get one or the other of these two pages, http://tiny.cc/p40yy or http://tiny.cc/1f7u5.)

 

Now, the problem I'm having is that if I run the code listed below, I only ever get the HTML for the login-screen outcome, http://tiny.cc/gf07f, and never for either of the others. The code below is only set up for testing purposes, for one URL at a time. Does anyone know why I can get either of the two outcomes http://tiny.cc/p40yy or http://tiny.cc/1f7u5 when I manually enter the URLs into the browser, but only the outcome http://tiny.cc/gf07f with the code?

 

<?php

// RUN TEST
// Fetch two sample pages with plain file_get_contents(), printing the byte
// length of each response followed by a line break. The script pauses for
// five seconds between the first and the second request.
$test_urls = array(
    'http://logis.korail.go.kr/getcarinfo.do?car_no=7001',
    'http://logis.korail.go.kr/getcarinfo.do?car_no=7002',
);
foreach ($test_urls as $test_position => $test_url) {
    if ($test_position > 0) {
        sleep(5);
    }
    $test_contents = file_get_contents($test_url);
    echo strlen($test_contents), '<br>';
}


// FLEET TO BE LOOKED UP
// Only the first locomotive number is active; the rest of the fleet is kept
// in the comment below so it can be re-enabled later.
$loco = array("7001");
/*
$loco[] = "7002";
$loco[] = "7003";
$loco[] = "7004";
$loco[] = "7005";
$loco[] = "7006";
$loco[] = "7007";
$loco[] = "7008";
$loco[] = "7009";
$loco[] = "7010";
$loco[] = "7011";
$loco[] = "7012";
$loco[] = "7013";
$loco[] = "7014";
$loco[] = "7015";*/


// HANDLE FETCHED URL DATA
/**
 * Callback handed to RollingCurl: stores one response body on disk.
 *
 * The target file testFile.txt (relative to the current working directory)
 * is opened in 'w' mode, so it is truncated on every call and afterwards
 * holds only the response passed to the most recent call.
 *
 * @param string $response Response body to store
 * @param array  $info     Additional curl info; not used here
 * @return void
 */
function request_callback($response, $info) {
	$target = "testFile.txt";
	$handle = fopen($target, 'w');

	// Stop the whole script when the file cannot be opened for writing.
	if (!$handle) {
		die("can't open file");
	}

	fwrite($handle, $response);
	fclose($handle);
}


// REQUIRE ROLLING CURL
require("RollingCurl.php");


// POPULATE URL ARRAY
// Every lookup URL is the base address, the locomotive number and an empty
// cntr_no parameter, in that order.
$url_p1 = 'http://logis.korail.go.kr/getcarinfo.do?car_no=';
$url_p2 = '&cntr_no=';
foreach ($loco as $loco_number) {
    $urls[] = $url_p1 . $loco_number . $url_p2;
}


// FETCH URLS
// Queue one GET request per URL, then run the queue; each response body is
// passed to request_callback().
$rc = new RollingCurl("request_callback");
$rc->window_size = 20;
foreach ($urls as $url) {
    $rc->get($url);
}
$rc->execute();

?>

 

<?php

/*
Authored by Josh Fraser (www.joshfraser.com)
Released under Apache License 2.0

Maintained by Alexander Makarov, http://rmcreative.ru/

$Id$
*/

/**
 * Value holder describing a single curl request.
 *
 * All fields are public and are simply copied from the constructor arguments.
 */
class Request
{
    /** @var string|bool Target URL; false until the constructor sets it */
    public $url = false;

    /** @var string HTTP method name */
    public $method = 'GET';

    /** @var mixed Data to send with the request, or null */
    public $post_data = null;

    /** @var mixed Headers for this request, or null */
    public $headers = null;

    /** @var mixed Extra curl options for this request, or null */
    public $options = null;

    /**
     * Store the given request description on the object.
     *
     * @param string $url
     * @param string $method
     * @param mixed  $post_data
     * @param mixed  $headers
     * @param mixed  $options
     * @return void
     */
    public function __construct($url, $method = "GET", $post_data = null, $headers = null, $options = null)
    {
        // Each argument is stored in the property of the same name.
        $fields = compact('url', 'method', 'post_data', 'headers', 'options');
        foreach ($fields as $field => $value) {
            $this->{$field} = $value;
        }
    }
}

/**
 * Exception type thrown by RollingCurl; adds nothing to the base Exception.
 */
class RollingCurlException extends Exception
{
}

/**
 * Holds a queue of curl requests and runs them with a bounded number of
 * simultaneous connections (the "rolling window").
 *
 * The non-public properties can be read and written from outside through the
 * magic __get()/__set() methods, e.g. $rc->window_size = 10.
 *
 * @throws RollingCurlException
 */
class RollingCurl {
    /**
     * @var int
     *
     * Window size is the max number of simultaneous connections allowed.
     *
     * REMEMBER TO RESPECT THE SERVERS:
     * Sending too many requests at one time can easily be perceived
     * as a DOS attack. Increase this window_size if you are making requests
     * to multiple servers or have permission from the receiving server admins.
     */
    private $window_size = 5;

    /**
     * @var string|array
     *
     * Callback function to be applied to each result.
     */
    private $callback;

    /**
     * @var array
     *
     * Set your base options that you want to be used with EVERY request.
     *
     * NOTE(review): CURLOPT_SSL_VERIFYPEER is switched off here, so HTTPS
     * certificates are not verified. Left unchanged for compatibility.
     */
    protected $options = array(
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_CONNECTTIMEOUT => 30,
        CURLOPT_TIMEOUT => 30
    );

    /**
     * @var array
     *
     * Headers sent with every request (passed as CURLOPT_HTTPHEADER).
     */
    private $headers = array();

    /**
     * @var Request[]
     *
     * The request queue
     */
    private $requests = array();

    /**
     * @param  $callback
     * Callback function to be applied to each result.
     *
     * Can be specified as 'my_callback_function'
     * or array($object, 'my_callback_method').
     *
     * Function should take two parameters: $response, $info.
     * $response is response body, $info is additional curl info.
     *
     * @return void
     */
    function __construct($callback = null) {
        $this->callback = $callback;
    }

    /**
     * Read a property by name.
     *
     * @param string $name
     * @return mixed The property value, or null when it is unset or null
     */
    public function __get($name) {
        return (isset($this->{$name})) ? $this->{$name} : null;
    }

    /**
     * Write a property by name.
     *
     * For "options" and "headers" the new value is combined with the current
     * one using the array union operator, so entries whose keys already exist
     * keep their current value. Any other property is overwritten.
     *
     * @param string $name
     * @param mixed $value
     * @return bool
     */
    public function __set($name, $value){
        // append the base options & headers
        if ($name == "options" || $name == "headers") {
            $this->{$name} = $this->{$name} + $value;
        } else {
            $this->{$name} = $value;
        }
        return true;
    }

    /**
     * Add a request to the request queue
     *
     * @param Request $request
     * @return bool
     */
    public function add($request) {
        $this->requests[] = $request;
        return true;
    }

    /**
     * Create new Request and add it to the request queue
     *
     * @param string $url
     * @param string $method
     * @param  $post_data
     * @param  $headers
     * @param  $options
     * @return bool
     */
    public function request($url, $method = "GET", $post_data = null, $headers = null, $options = null) {
        $this->requests[] = new Request($url, $method, $post_data, $headers, $options);
        return true;
    }

    /**
     * Queue a GET request
     *
     * @param string $url
     * @param  $headers
     * @param  $options
     * @return bool
     */
    public function get($url, $headers = null, $options = null) {
        return $this->request($url, "GET", null, $headers, $options);
    }

    /**
     * Queue a POST request
     *
     * @param string $url
     * @param  $post_data
     * @param  $headers
     * @param  $options
     * @return bool
     */
    public function post($url, $post_data = null, $headers = null, $options = null) {
        return $this->request($url, "POST", $post_data, $headers, $options);
    }

    /**
     * Run the queued requests.
     *
     * A queue holding exactly one request is executed as a plain curl call;
     * every other queue size goes through the rolling window.
     *
     * @param int $window_size Max number of simultaneous connections
     * @throws RollingCurlException when the effective window size is below 2
     * @return string|bool
     */
    public function execute($window_size = null) {
        // the rolling window needs at least two requests, so a single one is run directly
        if (sizeof($this->requests) == 1) {
            return $this->single_curl();
        } else {
            // start the rolling curl. window_size is the max number of simultaneous connections
            return $this->rolling_curl($window_size);
        }
    }

    /**
     * Performs a single curl request, removing it from the queue.
     *
     * When a callback is configured, the result is passed to it (if it is
     * callable) and nothing is returned; without a callback the curl_exec()
     * result is returned.
     *
     * @access private
     * @return string
     */
    private function single_curl() {
        $ch = curl_init();
        $options = $this->get_options(array_shift($this->requests));
        curl_setopt_array($ch, $options);
        $output = curl_exec($ch);
        $info = curl_getinfo($ch);

        // it's not necessary to set a callback for one-off requests
        if ($this->callback) {
            $callback = $this->callback;
            if (is_callable($this->callback)) {
                call_user_func($callback, $output, $info);
            }
        } else {
            return $output;
        }
    }

    /**
     * Performs multiple curl requests
     *
     * The first window_size requests are started immediately; every time a
     * transfer completes, its result is passed to the callback and the next
     * queued request (if any) is started in its place.
     *
     * @access private
     * @throws RollingCurlException
     * @param int $window_size Max number of simultaneous connections
     * @return bool
     */
    private function rolling_curl($window_size = null) {
        if ($window_size)
            $this->window_size = $window_size;

        // make sure the rolling window isn't greater than the # of urls
        if (sizeof($this->requests) < $this->window_size)
            $this->window_size = sizeof($this->requests);

        if ($this->window_size < 2) {
            throw new RollingCurlException("Window size must be greater than 1");
        }

        $master = curl_multi_init();

        // start the first batch of requests
        for ($i = 0; $i < $this->window_size; $i++) {
            $ch = curl_init();
            $options = $this->get_options($this->requests[$i]);
            curl_setopt_array($ch, $options);
            curl_multi_add_handle($master, $ch);
        }

        do {
            while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM);
            if ($execrun != CURLM_OK)
                break;

            // Set when this pass puts a new handle on the stack. $running was
            // reported before those handles were added, so it cannot be relied
            // on alone to decide whether work remains.
            $added = false;

            // a request was just completed -- find out which one
            while ($done = curl_multi_info_read($master)) {

                // get the info and content returned on the request
                $info = curl_getinfo($done['handle']);
                $output = curl_multi_getcontent($done['handle']);

                // send the return values to the callback function.
                $callback = $this->callback;
                if (is_callable($callback)) {
                    call_user_func($callback, $output, $info);
                }

                // start a new request (it's important to do this before removing the old one)
                if (isset($this->requests[$i])) {
                    $ch = curl_init();
                    $options = $this->get_options($this->requests[$i++]);
                    curl_setopt_array($ch, $options);
                    curl_multi_add_handle($master, $ch);
                    $added = true;
                }

                // remove the curl handle that just completed
                curl_multi_remove_handle($master, $done['handle']);
            }

            // Wait for socket activity instead of spinning on curl_multi_exec().
            // Skipped when new handles were just added so they start at once.
            if ($running && !$added) {
                if (curl_multi_select($master, 1.0) === -1) {
                    // select could not wait on anything; back off briefly
                    usleep(1000);
                }
            }
        } while ($running || $added);

        curl_multi_close($master);
        return true;
    }

    /**
     * Helper function to set up a new request by setting the appropriate options
     *
     * Only the url, options and post_data fields of the request are read here;
     * its method and headers fields are not consulted.
     *
     * @access private
     * @param Request $request
     * @return array
     */
    private function get_options($request) {
        // options for this entire curl object
        $options = $this->__get('options');
        if (ini_get('safe_mode') == 'Off' || !ini_get('safe_mode')) {
            $options[CURLOPT_FOLLOWLOCATION] = 1;
            $options[CURLOPT_MAXREDIRS] = 5;
        }
        $headers = $this->__get('headers');

        // append custom options for this specific request
        // (array union: keys already present in the base options are kept)
        if ($request->options) {
            $options += $request->options;
        }

        // set the request URL
        $options[CURLOPT_URL] = $request->url;

        // posting data w/ this request?
        if ($request->post_data) {
            $options[CURLOPT_POST] = 1;
            $options[CURLOPT_POSTFIELDS] = $request->post_data;
        }
        if ($headers) {
            $options[CURLOPT_HEADER] = 0;
            $options[CURLOPT_HTTPHEADER] = $headers;
        }

        return $options;
    }

    /**
     * Drop all state held by the object.
     *
     * @return void
     */
    public function __destruct() {
        unset($this->window_size, $this->callback, $this->options, $this->headers, $this->requests);
    }
}


?>

Link to comment
Share on other sites

Sites often do things like set cookies or perform redirects, so simulating a browser is non-trivial. This is one of the reasons that people often use the curl library, which has a rich set of client features. Firebug is one tool that can help you explore what the site is actually doing for each of the URLs.

Link to comment
Share on other sites

This thread is more than a year old. Please don't revive it unless you have something important to add.

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Restore formatting

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.