strago Posted February 25, 2010 Share Posted February 25, 2010 $new->url('hddp://www.domain.com/'); How do you let the browser URL pick it? I tried the simple $url = $_GET['url']; but it looks like that's not enough to be able to get it from domain.com/file.php?url=http://www.domain.com/ and $new->startURL($_GET['url']); spits out... Fatal error: Call to a member function on a non-object in /public_html/implementation.php on line 20 Link to comment https://forums.phpfreaks.com/topic/193307-get-data-from-url/ Share on other sites More sharing options...
ocpaul20 Posted February 25, 2010 Share Posted February 25, 2010 I am a very simple programmer and if I wanted to get a parameter from the url I would use in the program extractabc.php $x = $_GET['url']; // would return abc to me in the $x variable if the url was http://www.example.com/extractabc.php?url=abc Is that what you wanted? Link to comment https://forums.phpfreaks.com/topic/193307-get-data-from-url/#findComment-1017843 Share on other sites More sharing options...
strago Posted February 26, 2010 Author Share Posted February 26, 2010 That was one of the codes I tried but it didn't do it. Down about half way in the script, editing a line to $pageContent = $this->getContents($_GET['url']); does it, BUT...it keeps getting it over and over and over and over and.... Edit: OK, how did this thread get 1,229 views in less than a day!!! Link to comment https://forums.phpfreaks.com/topic/193307-get-data-from-url/#findComment-1018302 Share on other sites More sharing options...
Alias Posted February 26, 2010 Share Posted February 26, 2010 $_GET['url'] Should work perfectly fine. Can you please post all of your code so that we can see what's happening. Also note that in the picture attached you can see the code, the URL with the query attached, and the output of the script. [attachment deleted by admin] Link to comment https://forums.phpfreaks.com/topic/193307-get-data-from-url/#findComment-1018340 Share on other sites More sharing options...
strago Posted February 26, 2010 Author Share Posted February 26, 2010 About half way down there's the one line code that tries to do this. <?php //Doesn't do it. Code down below tries to. $url = $_GET['url']; $DB_USER = 'root'; $DB_PASSWORD = 'XXXXXXXXX'; $DB_HOST = 'localhost'; $DB_NAME = 'mailscraper'; $dbc = mysql_connect ($DB_HOST, $DB_USER, $DB_PASSWORD) or $error = mysql_error(); mysql_select_db($DB_NAME) or $error = mysql_error(); $new = new scraper; // Start Path can be empty, which will be extracted from the start URL $new->setStartPath(); //$new->setStartPath('http://forums.digitalpoint.com'); //$new->startURL('http://www.cmgscc.com/mailscraper/emails.shtml'); $new->startScraping(); class scraper { // URL that stores first URL to start var $startURL; // List of allowed page extensions var $allowedExtensions = array('.css','.xml','.rss','.ico','.js','.gif','.jpg','.jpeg','.png','.bmp','.wmv' ,'.avi','.mp3','.flash','.swf','.css'); // Which URL to scrape var $useURL; // Start path, for links that are relative var $startPath; // Set start path function setStartPath($path = NULL){ if($path != NULL) { $this->startPath = $path; } else { $temp = explode('/',$this->startURL); $this->startPath = $temp[0].'//'.$temp[2]; } } // Add the start URL function startURL($theURL){ // Set start URL $this->startURL = $theURL; } // Function to get URL contents function getContents($url) { $ch = curl_init(); // initialize curl handle curl_setopt($ch, CURLOPT_HEADER, 0); curl_setopt($ch, CURLOPT_VERBOSE, 0); curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible;)"); curl_setopt($ch, CURLOPT_AUTOREFERER, false); curl_setopt($ch, CURLOPT_CONNECTTIMEOUT,7); curl_setopt($ch, CURLOPT_REFERER, 'http://'.$this->useURL); curl_setopt($ch, CURLOPT_URL,$url); // set url to post to curl_setopt($ch, CURLOPT_FAILONERROR, 1); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 0);// allow redirects curl_setopt($ch, CURLOPT_RETURNTRANSFER,1); // return into a variable curl_setopt($ch, CURLOPT_TIMEOUT, 50); // times out after 50s curl_setopt($ch, CURLOPT_POST, 0); // set POST method $buffer = curl_exec($ch); // run the whole process curl_close($ch); return $buffer; } // Actually do the URLS function startScraping() { // Get page content //This code does it...over and over and over and over and over and over!!!!!!!! // $pageContent = $this->getContents($_GET['url']); //Original $pageContent = $this->getContents($this->startURL); // echo '<BR>Scraping URL: '.$this->startURL.PHP_EOL; // Get list of all emails on page preg_match_all('/href="mailto:([^"]+)"/Umis',$pageContent,$results); //Older one //preg_match_all('/([\w+.-]+@[\w+\.]*\w+[\w+\-\w+]*\.\w+)/is',$pageContent,$results); // Add the email to the email list array $insertCount=0; foreach($results[1] as $curEmail) { // $insert = mysql_query("INSERT INTO `emaillist` (`emailadd`) VALUES ('$curEmail')"); //Use text file instead. define("EMAIL_FILE","/home/virtual/site8/fst/var/www/html/mailscraper/email.shtml"); $emailEntry = "<A HREF=\"mailto:$curEmail\">E-Mail Me</a><BR>"; { $emailFile = fopen(EMAIL_FILE,"a"); } fwrite($emailFile,$emailEntry); fclose($emailFile); if($insert){$insertCount++;} echo <<< END <A HREF="mailto:$curEmail">E-Mail Me</a><BR> END; } // echo 'Emails found: '.number_format($insertCount).PHP_EOL; // Mark the page done $insert = mysql_query("INSERT INTO `finishedurls` (`urlname`) VALUES ('".$this->startURL."')"); // Get list of new page URLS is emails were found on previous page preg_match_all('/href="([^"]+)"/Umis',$pageContent,$results); $currentList = $this->cleanListURLs($results[1]); $insertURLCount=0; // Add the list to the array foreach($currentList as $curURL) { $insert = mysql_query("INSERT INTO `workingurls` (`urlname`) VALUES ('$curURL')"); if($insert){$insertURLCount++;} } // echo 'URLs found: '.number_format($insertURLCount).PHP_EOL; $getURL = mysql_fetch_assoc(mysql_query("SELECT `urlname` FROM `workingurls` ORDER BY RAND() LIMIT 1")); $remove = mysql_query("DELETE FROM `workingurls` WHERE `urlname`='$getURL[urlname]' LIMIT 1"); // Get the new page ready $this->startURL = $getURL['urlname']; $this->setStartPath(); // If no more pages, return if($this->startURL == NULL){ return;} // Clean vars unset($results,$pageContent); // If more pages, loop again $this->startScraping(); } // Function to clean input URLS function cleanListURLs($linkList) { foreach($linkList as $sub => $url) { // Check if only 1 character - there must exist at least / character if(strlen($url) <= 1){unset($linkList[$sub]);} // Check for any javascript if(eregi('javascript',$url)){unset($linkList[$sub]);} // Check for invalid extensions //str_replace($this->allowedExtensions,'',$url,$count); if($count > 0){ unset($linkList[$sub]);} // If URL starts with #, ignore if(substr($url,0,1) == '#'){unset($linkList[$sub]);} // If everything is OK and path is relative, add starting path if(substr($url,0,1) == '/' || substr($url,0,1) == '?' || substr($url,0,1) == '='){ $linkList[$sub] = $this->startPath.$url; } } $remove = mysql_query("DELETE FROM `finishedurls`"); $optimize = mysql_query("OPTIMIZE TABLE `emaillist` , `finishedurls` , `workingurls`"); return $linkList; } } ?> </body> </html> Link to comment https://forums.phpfreaks.com/topic/193307-get-data-from-url/#findComment-1018547 Share on other sites More sharing options...
strago Posted February 27, 2010 Author Share Posted February 27, 2010 Turns out $new->setStartPath(http://wwww.domain.com/); just needed to have the $url in it instead! $new->setStartPath($url); Link to comment https://forums.phpfreaks.com/topic/193307-get-data-from-url/#findComment-1018862 Share on other sites More sharing options...
Recommended Posts
Archived
This topic is now archived and is closed to further replies.