Jump to content

[SOLVED] WTF?! Variable value disapearing?


hellonoko

Recommended Posts

I have a simple site scraper that uses CURL.

 

I was just adding some code to it that compares certain links to the base site address and then completes them if necessary. For example if it scans:

 

index.php it makes that link into http://www.site.com/login.php

 

My problem is that $site_url (line 3) at the very top of the page echos correctly.

 

But then when I get down to line 91 where I have put in a echo to test it. Nothing. Empty.

 

These are the only two times it is called. Where is it going? I even tried adding a echo at the very bottom end of my code to make sure it wasn't just me using it in the wrong place or something.

 

Any ideas? Thanks.

 

<?php

echo $site_url = 'http://www.empreintes-digitales.fr<br><br>';
$target_url = 'http://www.empreintes-digitales.fr/index.php?post=794';

//$target_url = 'http://redthreat.wordpress.com/';
//$target_url= 'http://www.kissatlanta.com/blog/';
//$target_url= 'http://www.empreintes-digitales.fr/';

$userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';

crawl_page( $target_url, $userAgent);

function crawl_page( $target_url, $userAgent)
{
	$ch = curl_init();

	curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
	curl_setopt($ch, CURLOPT_URL,$target_url);
	curl_setopt($ch, CURLOPT_FAILONERROR, true);
	curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
	curl_setopt($ch, CURLOPT_AUTOREFERER, true);
	curl_setopt($ch, CURLOPT_RETURNTRANSFER,true);
	curl_setopt($ch, CURLOPT_TIMEOUT, 10);

	$html = curl_exec($ch);

	if (!$html) 
	{
		echo "<br />cURL error number:" .curl_errno($ch);
		echo "<br />cURL error:" . curl_error($ch);
		exit;
	}

	//
	// load scrapped data into the DOM
	//

	$dom = new DOMDocument();
	@$dom->loadHTML($html);

	//
	// get only LINKS from the DOM with XPath
	//

	$xpath = new DOMXPath($dom);
	$hrefs = $xpath->evaluate("/html/body//a");

	//
	// go through all the links and store to db or whatever
	//
	for ($i = 0; $i < $hrefs->length; $i++) 
	{
		$href = $hrefs->item($i);
		$url = $href->getAttribute('href');

		$links_1[$link] = $url;

		echo $url;
		echo '<br>';
	}



	for ($i = 0; $i < $hrefs->length; $i++) 
	{
		$href = $hrefs->item($i);
		$url = $href->getAttribute('href');

		$find = ".mp3";
		$pos = strpos($url, $find);

		if ($pos === false) 
		{
   			
		} 
		else 
		{
   			// function to store to db
			//storeLink($url,$target_url);
			echo 'File: ';
			echo $url;
			echo '<br>';

			$last_slash = strripos( $url ,"/");

			$clean_file_name = substr( $url , $last_slash + 1 , strlen($url) );

			//fixes the url if it does not have a FULL address

			echo $site_url;

			if ( strstr( $url , $base_url) !=  TRUE )
			{
				echo '<b>BROKEN URL</b><br>';
				echo $base_url;
				$url = $base_url . $url;
				echo 'FIXED URL: '.$url.'<BR>';

			}
			exit();

			echo 'From: ';
			echo $target_url;
			echo '<br>';

			//directory to copy to (must be CHMOD to 777)
			$copydir = "/home2/sharingi/public_html/scrape/scraped/";

			$data = file_get_contents($url);
			$file = fopen($copydir . $clean_file_name, "w+");
			fputs($file, $data);
			fclose($file);

			//$savefile="tempimg/".time().".jpg";


			//$ch = curl_init ($copydir);
			//$fp = fopen ($copydir . $clean_file_name, "w+");
			//curl_setopt ($ch, CURLOPT_FILE, $fp);
			//curl_setopt ($ch, CURLOPT_HEADER, 0);
			//curl_exec ($ch);
			//curl_close ($ch);
			//fclose ($fp);

			//echo "Coppied!";
			//echo "<br><br>";
		}	
	}

}

?>

 

 

Link to comment
Share on other sites

This thread is more than a year old. Please don't revive it unless you have something important to add.

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Restore formatting

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.