korail Posted June 20, 2010 Share Posted June 20, 2010 Hello, I have the following code, which is designed to visit several URLs, save the HTML for further use (parsing for data) and move on to the next one. The problem I'm having is that when you manually type one of the URLs into your browser, you get one of three different outcomes. The first outcome, http://tiny.cc/gf07f, is what happens when you first enter one of the URLs: you get a login screen. Then, if you refresh the URL or enter another similar URL at the same domain, you get either http://tiny.cc/p40yy or http://tiny.cc/1f7u5. (These URLs have a four-digit number on the end, which identifies a locomotive running on the Korean National Railroad, and depending on whether the locomotive is working a train you get http://tiny.cc/1f7u5, or if not you get http://tiny.cc/1f7u5.) Now, the problem I'm having is that if I run the code listed below, I only get code back for the outcome http://tiny.cc/gf07f, not any of the others. The code below is only set up for testing purposes, for one URL at a time. Does anyone know why I can get either of the two outcomes http://tiny.cc/p40yy or http://tiny.cc/1f7u5 when I manually enter URLs into the browser, but only the outcome http://tiny.cc/gf07f with the code? 
<?php

// RUN TEST
// Fetch two car pages directly, five seconds apart, and print the length of
// each response body so the results can be compared by eye.
$test_url1 = 'http://logis.korail.go.kr/getcarinfo.do?car_no=7001';
$test_contents1 = file_get_contents($test_url1);
echo strlen ($test_contents1);
echo '<br>';
sleep(5);
$test_url2 = 'http://logis.korail.go.kr/getcarinfo.do?car_no=7002';
$test_contents2 = file_get_contents($test_url2);
echo strlen ($test_contents2);
echo '<br>';

// FLEET TO BE LOOKED UP
// Only one locomotive is enabled while testing; the rest stay commented out.
$loco = array();
$loco[] = "7001";
/*
$loco[] = "7002";
$loco[] = "7003";
$loco[] = "7004";
$loco[] = "7005";
$loco[] = "7006";
$loco[] = "7007";
$loco[] = "7008";
$loco[] = "7009";
$loco[] = "7010";
$loco[] = "7011";
$loco[] = "7012";
$loco[] = "7013";
$loco[] = "7014";
$loco[] = "7015";
*/

// GET URL DATA
/**
 * Callback invoked by RollingCurl for every completed request.
 *
 * Writes the response body to testFile.txt. The file is opened in 'w' mode,
 * so each call truncates it: when several URLs are fetched, only the body of
 * the last completed request remains on disk.
 *
 * @param string|bool $response Response body as returned by curl
 * @param array       $info     Result of curl_getinfo() for the request (unused)
 * @return void
 */
function request_callback($response, $info)
{
    // WRITE RESPONSE DATA TO FILE
    $myFile = "testFile.txt";
    $fh = fopen($myFile, 'w') or die("can't open file");
    fwrite($fh, $response);
    fclose($fh);
}

// REQUIRE ROLLING CURL
require("RollingCurl.php");

// POPULATE URL ARRAY
$url_p1 = 'http://logis.korail.go.kr/getcarinfo.do?car_no=';
$url_p2 = '&cntr_no=';
$urls = array();
foreach ($loco as $car_no) {
    $urls[] = $url_p1 . $car_no . $url_p2;
}

// FETCH URLS
// NOTE(review): no cookie options are set here, so every request starts
// without any cookies from earlier requests -- confirm whether the site
// needs a session cookie before it serves the car pages.
$rc = new RollingCurl("request_callback");
$rc->window_size = 20;
foreach ($urls as $url) {
    $request = new Request($url);
    $rc->add($request);
}
$rc->execute();
?>
<?php
/*
Authored by Josh Fraser (www.joshfraser.com)
Released under Apache License 2.0

Maintained by Alexander Makarov, http://rmcreative.ru/

$Id$
*/

/**
 * Class that represent a single curl request
 */
class Request
{
    public $url = false;
    public $method = 'GET';
    public $post_data = null;
    public $headers = null;
    public $options = null;

    /**
     * @param string $url
     * @param string $method
     * @param $post_data
     * @param $headers
     * @param $options
     * @return void
     */
    public function __construct($url, $method = "GET", $post_data = null, $headers = null, $options = null)
    {
        $this->url = $url;
        $this->method = $method;
        $this->post_data = $post_data;
        $this->headers = $headers;
        $this->options = $options;
    }
}

/**
 * RollingCurl custom exception
 */
class RollingCurlException extends Exception {}

/**
 * Class that holds a rolling queue of curl requests.
 *
 * @throws RollingCurlException
 */
class RollingCurl
{
    /**
     * @var int
     *
     * Window size is the max number of simultaneous connections allowed.
     *
     * REMEMBER TO RESPECT THE SERVERS:
     * Sending too many requests at one time can easily be perceived
     * as a DOS attack. Increase this window_size if you are making requests
     * to multiple servers or have permission from the receiving server admins.
     */
    private $window_size = 5;

    /**
     * @var string|array
     *
     * Callback function to be applied to each result.
     */
    private $callback;

    /**
     * @var array
     *
     * Set your base options that you want to be used with EVERY request.
     */
    protected $options = array(
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_CONNECTTIMEOUT => 30,
        CURLOPT_TIMEOUT => 30
    );

    /**
     * @var array
     */
    private $headers = array();

    /**
     * @var Request[]
     *
     * The request queue
     */
    private $requests = array();

    /**
     * @param $callback
     * Callback function to be applied to each result.
     *
     * Can be specified as 'my_callback_function'
     * or array($object, 'my_callback_method').
     *
     * Function should take two parameters: $response, $info.
     * $response is response body, $info is additional curl info.
     *
     * @return void
     */
    public function __construct($callback = null)
    {
        $this->callback = $callback;
    }

    /**
     * Read access to the non-public properties.
     *
     * @param string $name
     * @return mixed The property value, or null when it is unset or null
     */
    public function __get($name)
    {
        return (isset($this->{$name})) ? $this->{$name} : null;
    }

    /**
     * Write access to the non-public properties.
     *
     * "options" and "headers" are merged into the existing values instead of
     * replacing them; every other property is simply overwritten.
     *
     * @param string $name
     * @param mixed $value
     * @return bool
     */
    public function __set($name, $value)
    {
        if ($name == "options") {
            // array_replace() keeps the existing key order and the integer
            // CURLOPT_* keys while letting the caller's values win; the old
            // `$this->options + $value` silently ignored any attempt to
            // override a default such as CURLOPT_TIMEOUT.
            $this->options = array_replace($this->options, $value);
        } elseif ($name == "headers") {
            // headers are a numerically indexed list, so `+` would drop new
            // entries whose index collides with an existing one
            $this->headers = array_merge($this->headers, $value);
        } else {
            $this->{$name} = $value;
        }
        return true;
    }

    /**
     * Add a request to the request queue
     *
     * @param Request $request
     * @return bool
     */
    public function add($request)
    {
        $this->requests[] = $request;
        return true;
    }

    /**
     * Create new Request and add it to the request queue
     *
     * @param string $url
     * @param string $method
     * @param $post_data
     * @param $headers
     * @param $options
     * @return bool
     */
    public function request($url, $method = "GET", $post_data = null, $headers = null, $options = null)
    {
        $this->requests[] = new Request($url, $method, $post_data, $headers, $options);
        return true;
    }

    /**
     * Perform GET request
     *
     * @param string $url
     * @param $headers
     * @param $options
     * @return bool
     */
    public function get($url, $headers = null, $options = null)
    {
        return $this->request($url, "GET", null, $headers, $options);
    }

    /**
     * Perform POST request
     *
     * @param string $url
     * @param $post_data
     * @param $headers
     * @param $options
     * @return bool
     */
    public function post($url, $post_data = null, $headers = null, $options = null)
    {
        return $this->request($url, "POST", $post_data, $headers, $options);
    }

    /**
     * Execute the curl
     *
     * @param int $window_size Max number of simultaneous connections
     * @return string|bool
     */
    public function execute($window_size = null)
    {
        // rolling curl window must always be greater than 1
        if (sizeof($this->requests) == 1) {
            return $this->single_curl();
        } else {
            // start the rolling curl. window_size is the max number of simultaneous connections
            return $this->rolling_curl($window_size);
        }
    }

    /**
     * Performs a single curl request
     *
     * The request is removed from the queue. Without a callback the response
     * body is returned directly; with a callback the body is handed to it
     * (if it is callable) and true is returned, matching rolling_curl().
     *
     * @access private
     * @return string|bool
     */
    private function single_curl()
    {
        $ch = curl_init();
        $options = $this->get_options(array_shift($this->requests));
        curl_setopt_array($ch, $options);
        $output = curl_exec($ch);
        $info = curl_getinfo($ch);

        // it's not necessary to set a callback for one-off requests
        if (!$this->callback) {
            return $output;
        }

        if (is_callable($this->callback)) {
            call_user_func($this->callback, $output, $info);
        }
        return true;
    }

    /**
     * Performs multiple curl requests
     *
     * Keeps up to window_size transfers in flight; every time one completes,
     * its result is passed to the callback and the next queued request is
     * started.
     *
     * @access private
     * @throws RollingCurlException
     * @param int $window_size Max number of simultaneous connections
     * @return bool
     */
    private function rolling_curl($window_size = null)
    {
        if ($window_size) {
            $this->window_size = $window_size;
        }

        // make sure the rolling window isn't greater than the # of urls
        if (sizeof($this->requests) < $this->window_size) {
            $this->window_size = sizeof($this->requests);
        }

        if ($this->window_size < 2) {
            throw new RollingCurlException("Window size must be greater than 1");
        }

        $master = curl_multi_init();

        // start the first batch of requests
        for ($i = 0; $i < $this->window_size; $i++) {
            $ch = curl_init();
            $options = $this->get_options($this->requests[$i]);
            curl_setopt_array($ch, $options);
            curl_multi_add_handle($master, $ch);
        }

        do {
            do {
                $execrun = curl_multi_exec($master, $running);
            } while ($execrun == CURLM_CALL_MULTI_PERFORM);

            if ($execrun != CURLM_OK) {
                break;
            }

            // set when a new transfer is added during this pass
            $queued = false;

            // a request was just completed -- find out which one
            while ($done = curl_multi_info_read($master)) {
                // get the info and content returned on the request
                $info = curl_getinfo($done['handle']);
                $output = curl_multi_getcontent($done['handle']);

                // send the return values to the callback function.
                $callback = $this->callback;
                if (is_callable($callback)) {
                    call_user_func($callback, $output, $info);
                }

                // start a new request (it's important to do this before removing the old one)
                if ($i < count($this->requests) && isset($this->requests[$i])) {
                    $ch = curl_init();
                    $options = $this->get_options($this->requests[$i++]);
                    curl_setopt_array($ch, $options);
                    curl_multi_add_handle($master, $ch);
                    $queued = true;
                }

                // remove the curl handle that just completed
                curl_multi_remove_handle($master, $done['handle']);
            }

            // Block until there is activity instead of spinning on
            // curl_multi_exec(). Skipped when a handle was just added so the
            // new transfer is started immediately on the next pass.
            if ($running && !$queued) {
                if (curl_multi_select($master, 1.0) === -1) {
                    // select failed; back off briefly so the loop
                    // does not degrade into a busy-wait
                    usleep(1000);
                }
            }

            // $running was measured before the new handles were added, so it
            // can be 0 even though transfers are still pending; looping on
            // $queued as well ensures those requests are actually executed.
        } while ($running || $queued);

        curl_multi_close($master);
        return true;
    }

    /**
     * Helper function to set up a new request by setting the appropriate options
     *
     * Starts from the object-wide options, enables redirect following when
     * safe_mode is off, applies the request's own options and headers, and
     * finally sets the URL and any POST payload.
     *
     * @access private
     * @param Request $request
     * @return array
     */
    private function get_options($request)
    {
        // options for this entire curl object
        $options = $this->__get('options');
        if (ini_get('safe_mode') == 'Off' || !ini_get('safe_mode')) {
            $options[CURLOPT_FOLLOWLOCATION] = 1;
            $options[CURLOPT_MAXREDIRS] = 5;
        }
        $headers = (array) $this->__get('headers');

        // apply custom options for this specific request; they take
        // precedence over the object-wide ones (previously `+=` let the
        // base options win, so a per-request override had no effect)
        if ($request->options) {
            $options = array_replace($options, $request->options);
        }

        // set the request URL
        $options[CURLOPT_URL] = $request->url;

        // posting data w/ this request?
        if ($request->post_data) {
            $options[CURLOPT_POST] = 1;
            $options[CURLOPT_POSTFIELDS] = $request->post_data;
        }

        // headers given on the Request are sent in addition to the
        // object-wide headers (they used to be silently ignored)
        if ($request->headers) {
            $headers = array_merge($headers, (array) $request->headers);
        }
        if ($headers) {
            $options[CURLOPT_HEADER] = 0;
            $options[CURLOPT_HTTPHEADER] = $headers;
        }

        return $options;
    }

    /**
     * @return void
     */
    public function __destruct()
    {
        unset($this->window_size, $this->callback, $this->options, $this->headers, $this->requests);
    }
}
?> Quote Link to comment Share on other sites More sharing options...
gizmola Posted June 20, 2010 Share Posted June 20, 2010 Often sites do things like set cookies or perform redirects. Simulating a browser is non-trivial. This is one of the reasons that people often use the cURL library, which has a rich set of client features. Firebug is one tool that can help you explore what the site is actually doing for each of the URLs. Quote Link to comment Share on other sites More sharing options...
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.