Jump to content

Get data from URL.


strago

Recommended Posts

$new->url('hddp://www.domain.com/');

 

How do you let the browser URL pick it? I tried the simple

 

$url = $_GET['url'];

 

but it looks like that's not enough to be able to get it from

domain.com/file.php?url=http://www.domain.com/

 

and

 

$new->startURL($_GET['url']);

 

spits out...

 

Fatal error: Call to a member function on a non-object in /public_html/implementation.php on line 20

Link to comment
https://forums.phpfreaks.com/topic/193307-get-data-from-url/
Share on other sites

I am a very simple programmer and if I wanted to get a parameter from the url I would use

 

in the program extractabc.php

$x = $_GET['url']; // would return abc to me in the $x variable

if the url was http://www.example.com/extractabc.php?url=abc

 

Is that what you wanted?

Link to comment
https://forums.phpfreaks.com/topic/193307-get-data-from-url/#findComment-1017843
Share on other sites

That was one of the codes I tried but it didn't do it. Down about half way in the script, editing a line to

 

$pageContent = $this->getContents($_GET['url']);

 

does it, BUT...it keeps getting it over and over and over and over and....

 

 

 

Edit: OK, how did this thread get 1,229 views in less than a day!!!

Link to comment
https://forums.phpfreaks.com/topic/193307-get-data-from-url/#findComment-1018302
Share on other sites

$_GET['url']

Should work perfectly fine.

 

Can you please post all of your code so that we can see what's happening. Also note that in the picture attached you can see the code, the URL with the query attached, and the output of the script.

 

[attachment deleted by admin]

Link to comment
https://forums.phpfreaks.com/topic/193307-get-data-from-url/#findComment-1018340
Share on other sites

About half way down there's the one line code that tries to do this.

 

<?php

//Doesn't do it. Code down below tries to.
$url = $_GET['url'];

$DB_USER =  'root';
$DB_PASSWORD = 'XXXXXXXXX';
$DB_HOST = 'localhost';
$DB_NAME = 'mailscraper';
$dbc = mysql_connect ($DB_HOST, $DB_USER, $DB_PASSWORD) or $error = mysql_error();
mysql_select_db($DB_NAME) or $error = mysql_error();

$new = new scraper;
// Start Path can be empty, which will be extracted from the start URL
$new->setStartPath();
//$new->setStartPath('http://forums.digitalpoint.com');
//$new->startURL('http://www.cmgscc.com/mailscraper/emails.shtml');


$new->startScraping();


class scraper
{
    // URL that stores first URL to start
    var $startURL;
    
    // List of allowed page extensions
    var $allowedExtensions = array('.css','.xml','.rss','.ico','.js','.gif','.jpg','.jpeg','.png','.bmp','.wmv'
        ,'.avi','.mp3','.flash','.swf','.css');
    
    // Which URL to scrape
    var $useURL;
    
    // Start path, for links that are relative
    var $startPath;
    
    // Set start path
    function setStartPath($path = NULL){
        if($path != NULL)
        {
            $this->startPath = $path;
        } else {
            $temp = explode('/',$this->startURL);
            $this->startPath = $temp[0].'//'.$temp[2];
        }
    }
    
    // Add the start URL
    function startURL($theURL){
        // Set start URL
        $this->startURL = $theURL;
    }
    
    // Function to get URL contents
    function getContents($url)
    {
        $ch = curl_init(); // initialize curl handle
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_VERBOSE, 0);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible;)");
        curl_setopt($ch, CURLOPT_AUTOREFERER, false);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT,7);
        curl_setopt($ch, CURLOPT_REFERER, 'http://'.$this->useURL);
        curl_setopt($ch, CURLOPT_URL,$url); // set url to post to
        curl_setopt($ch, CURLOPT_FAILONERROR, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 0);// allow redirects
        curl_setopt($ch, CURLOPT_RETURNTRANSFER,1); // return into a variable
        curl_setopt($ch, CURLOPT_TIMEOUT, 50); // times out after 50s
        curl_setopt($ch, CURLOPT_POST, 0); // set POST method
        $buffer = curl_exec($ch); // run the whole process
        curl_close($ch); 
        return $buffer;
    }
    
    // Actually do the URLS
    function startScraping()
    {
        // Get page content
  
  //This code does it...over and over and over and over and over and over!!!!!!!!    
//     $pageContent = $this->getContents($_GET['url']);  

      //Original
      $pageContent = $this->getContents($this->startURL);

//       echo '<BR>Scraping URL: '.$this->startURL.PHP_EOL;
        
        // Get list of all emails on page
        preg_match_all('/href="mailto:([^"]+)"/Umis',$pageContent,$results);

//Older one        
//preg_match_all('/([\w+.-]+@[\w+\.]*\w+[\w+\-\w+]*\.\w+)/is',$pageContent,$results);

        // Add the email to the email list array
        $insertCount=0;
        foreach($results[1] as $curEmail)
        {
        //    $insert = mysql_query("INSERT INTO `emaillist` (`emailadd`) VALUES ('$curEmail')");
            
                        //Use text file instead.
      define("EMAIL_FILE","/home/virtual/site8/fst/var/www/html/mailscraper/email.shtml");
    $emailEntry = "<A HREF=\"mailto:$curEmail\">E-Mail Me</a><BR>";
    {
        $emailFile = fopen(EMAIL_FILE,"a");
    }
    fwrite($emailFile,$emailEntry);
    fclose($emailFile);


            if($insert){$insertCount++;}
        echo <<< END
<A HREF="mailto:$curEmail">E-Mail Me</a><BR>
END;
        }
        
  //      echo 'Emails found: '.number_format($insertCount).PHP_EOL;
        
        // Mark the page done
        $insert = mysql_query("INSERT INTO `finishedurls` (`urlname`) VALUES ('".$this->startURL."')");
        
        // Get list of new page URLS is emails were found on previous page
        preg_match_all('/href="([^"]+)"/Umis',$pageContent,$results);
        $currentList = $this->cleanListURLs($results[1]);
        
        $insertURLCount=0;
        // Add the list to the array
        foreach($currentList as $curURL)
        {
            $insert = mysql_query("INSERT INTO `workingurls` (`urlname`) VALUES ('$curURL')");
            if($insert){$insertURLCount++;}
        }
        
  //      echo 'URLs found: '.number_format($insertURLCount).PHP_EOL;

        $getURL = mysql_fetch_assoc(mysql_query("SELECT `urlname` FROM `workingurls` ORDER BY RAND() LIMIT 1"));
        $remove = mysql_query("DELETE FROM `workingurls` WHERE `urlname`='$getURL[urlname]' LIMIT 1");
        
        // Get the new page ready
        $this->startURL = $getURL['urlname'];
        $this->setStartPath();
        
        // If no more pages, return
        if($this->startURL == NULL){ return;}
        // Clean vars
        unset($results,$pageContent);
        // If more pages, loop again
        $this->startScraping();
    }
    
    // Function to clean input URLS
    function cleanListURLs($linkList)
    {    
        foreach($linkList as $sub => $url)
        {
            // Check if only 1 character - there must exist at least / character
            if(strlen($url) <= 1){unset($linkList[$sub]);}
            // Check for any javascript
            if(eregi('javascript',$url)){unset($linkList[$sub]);}
            // Check for invalid extensions
            //str_replace($this->allowedExtensions,'',$url,$count);
            if($count > 0){ unset($linkList[$sub]);}
            // If URL starts with #, ignore
            if(substr($url,0,1) == '#'){unset($linkList[$sub]);}
            
            // If everything is OK and path is relative, add starting path
            if(substr($url,0,1) == '/' || substr($url,0,1) == '?' || substr($url,0,1) == '='){
                $linkList[$sub] = $this->startPath.$url;
            }
        }
        
        $remove = mysql_query("DELETE FROM `finishedurls`");
        $optimize = mysql_query("OPTIMIZE TABLE  `emaillist` , `finishedurls` , `workingurls`");

        return $linkList;
    }
    
}
?>
</body>
</html>

Link to comment
https://forums.phpfreaks.com/topic/193307-get-data-from-url/#findComment-1018547
Share on other sites

Archived

This topic is now archived and is closed to further replies.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.