Jump to content

CURL Cookies Problem


Recommended Posts

i'm trying to direct users to the pdf for a particular journal but i'm always directed to an intermediary abstract page:

http://scholar.google.com/scholar?hl=en&lr=&q=10.1002%2Fasna.200610720&btnG=Search

 

it looks like they are using cookies and they also generate an internal id number for the document (which you wouldn't know until you actually visited the page).

 

so, i started working on a script (see below) to collect the cookie from the site, then go back with the cookie and collect the internal document id on the abstract page, and then get the pdf. it seems so complicated for such a simple task! well, whatever i am doing is not working. can anybody help me. i've inserted some of my code below. maybe someone else has some ideas???

 

I used this tool in Firefox called Live HTTP Headers to track the headers being sent while I was visiting this page and it seems that maybe I may not be getting the cookie that the site gives after redirection. I saw that there are some CURL_OPT settings to follow the url. Do I have to use something like this?

 

Thanks in advance!

 

$url = "http://doi.wiley.com/$doi";

 

if (!($fp = fopen($url,"r"))) { die("Could not open login URL"); }

$meta = stream_get_meta_data($fp);

for ($j = 0; isset($meta["wrapper_data"][$j]); $j++)

{

if (strstr(strtolower($meta["wrapper_data"][$j]), 'cookie'))

{

$cookie = $meta["wrapper_data"][$j];

break;

}

}

fclose($fp);

 

$curl = curl_init( );

$file = "$id.txt";

$txt = fopen($file, "w+");

 

$url = $url."?cookieSet=1";

 

curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)");

curl_setopt($curl, CURLOPT_COOKIE, $cookie);

curl_setopt ($curl, CURLOPT_URL, $url);

curl_setopt($curl, CURLOPT_BINARYTRANSFER, 1);

curl_setopt($curl, CURLOPT_FILE, $txt);

curl_exec ($curl);

curl_close ($curl);

 

fclose($txt);

 

$html = file_get_contents("$id.txt");

$pdf = preg_replace("/fulltext\s*\/\s*(.*?)\s*\/\s*PDFSTART/","$1",$match[0]);

$url = "http://download.interscience.wiley.com/cgi-bin/fulltext?ID=".$pdf."&PLACEBO=IE.pdf&mode=pdf";

@unlink("$id.txt");

 

 

Link to comment
https://forums.phpfreaks.com/topic/53159-curl-cookies-problem/
Share on other sites

i tried doing this instead (with no luck of even getting to the abstract page to retrieve the document number):

 

$url = "http://doi.wiley.com/$doi";

 

$curl = curl_init();

curl_setopt($curl, CURLOPT_URL, $url);

curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)");

curl_setopt($curl, CURLOPT_COOKIEJAR, "cookie.txt");

curl_exec($curl);

curl_close($curl);

 

$curl2 = curl_init();

$file = "$id.txt";

$txt = fopen($file, "w+");

 

curl_setopt($curl2, CURLOPT_RETURNTRANSFER, 1);

curl_setopt($curl2, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)");

curl_setopt($curl2, CURLOPT_COOKIEFILE, "cookie.txt");

curl_setopt ($curl2, CURLOPT_URL, $url);

curl_setopt($curl2, CURLOPT_BINARYTRANSFER, 1);

curl_setopt($curl2, CURLOPT_FILE, $txt);

curl_exec ($curl2);

curl_close ($curl2);

fclose($txt);

 

$html = file_get_contents("$id.txt");

preg_match("/fulltext\s*\/.*?\/\s*PDFSTART/",$html,$match);

$pdf = preg_replace("/fulltext\s*\/\s*(.*?)\s*\/\s*PDFSTART/","$1",$match[0]);

$url = "http://download.interscience.wiley.com/cgi-bin/fulltext?ID=".$pdf."&PLACEBO=IE.pdf&mode=pdf";

@unlink("$id.txt");

 

Please help???

 

Link to comment
https://forums.phpfreaks.com/topic/53159-curl-cookies-problem/#findComment-263852
Share on other sites

I don't think I am even getting the correct cookie information from stream_get_meta_data. I'm seeing different cookie information when I use the Firefox plugin called Live HTTP Headers. Can somebody please help? I just want to go directly to the pdf instead of the abstract page for this particular journal (here is an example http://scholar.google.com/scholar?hl=en&lr=&q=10.1002%2Fasna.200610720&btnG=Search). Thanks!

Link to comment
https://forums.phpfreaks.com/topic/53159-curl-cookies-problem/#findComment-263963
Share on other sites

So I have new code below for trying to bypass an abstract page on Wiley's site to go to the PDF. It seems to work for the first time around but if I try to put it in a loop, it will fail when I move on to the next paper or DOI. I just realized that some of you out there probably can't get to the PDF... but maybe someone has access (a subscription) to this journal?

 

$url = "http://www3.interscience.wiley.com/cgi-bin/resolvedoi?DOI=$doi";

 

sleep(90);

 

$ch = curl_init();

curl_setopt($ch, CURLOPT_URL, $url);

curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

curl_setopt($ch, CURLOPT_USERAGENT, "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3");

curl_setopt($ch, CURLOPT_REFERER, "http://www3.interscience.wiley.com/cgi-bin/resolvedoi?DOI=$doi");

curl_setopt($ch, CURLOPT_VERBOSE, 0);

curl_setopt($ch, CURLOPT_HEADER, 1);

$cookie_file = fopen('an_cookie.txt', "w+");

curl_setopt ($ch, CURLOPT_COOKIEJAR, 'an_cookie.txt');

fclose($cookie_file);

$content = curl_exec ($ch); # This returns HTML

curl_close ($ch);

 

preg_match("/fulltext\s*\/.*?\/\s*PDFSTART/",$content,$match);

$pdf = preg_replace("/fulltext\s*\/\s*(.*?)\s*\/\s*PDFSTART/","$1",$match[0]);

$url = "http://download.interscience.wiley.com/cgi-bin/fulltext?ID=".$pdf."&PLACEBO=IE.pdf&mode=pdf";

 

$curl = curl_init();

curl_setopt($curl, CURLOPT_URL, $url);

curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);

curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);

curl_setopt($curl, CURLOPT_USERAGENT, "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3");

curl_setopt($curl, CURLOPT_REFERER, "http://www3.interscience.wiley.com/cgi-bin/fulltext/$pdf/PDFSTART");

curl_setopt($curl, CURLOPT_VERBOSE, 0);

curl_setopt($curl, CURLOPT_COOKIEFILE, 'an_cookie.txt');

curl_setopt($curl, CURLOPT_BINARYTRANSFER, 1);

$doc = curl_exec ($curl); # This returns PDF

curl_close ($curl);

 

 

$file = "$id.pdf";

$file_pdf = fopen($file, "wb");

fwrite($file_pdf,$doc);

fclose($file_pdf);

Link to comment
https://forums.phpfreaks.com/topic/53159-curl-cookies-problem/#findComment-264697
Share on other sites

Archived

This topic is now archived and is closed to further replies.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.