Jump to content

strago

Members
  • Posts

    93
  • Joined

  • Last visited

Posts posted by strago

  1. About half way down there's the one line code that tries to do this.

     

    <?php
    
    //Doesn't do it. Code down below tries to.
    $url = $_GET['url'];
    
    $DB_USER =  'root';
    $DB_PASSWORD = 'XXXXXXXXX';
    $DB_HOST = 'localhost';
    $DB_NAME = 'mailscraper';
    $dbc = mysql_connect ($DB_HOST, $DB_USER, $DB_PASSWORD) or $error = mysql_error();
    mysql_select_db($DB_NAME) or $error = mysql_error();
    
    $new = new scraper;
    // Start Path can be empty, which will be extracted from the start URL
    $new->setStartPath();
    //$new->setStartPath('http://forums.digitalpoint.com');
    //$new->startURL('http://www.cmgscc.com/mailscraper/emails.shtml');
    
    
    $new->startScraping();
    
    
    class scraper
    {
        // URL that stores first URL to start
        var $startURL;
        
        // List of allowed page extensions
        var $allowedExtensions = array('.css','.xml','.rss','.ico','.js','.gif','.jpg','.jpeg','.png','.bmp','.wmv'
            ,'.avi','.mp3','.flash','.swf','.css');
        
        // Which URL to scrape
        var $useURL;
        
        // Start path, for links that are relative
        var $startPath;
        
        // Set start path
        function setStartPath($path = NULL){
            if($path != NULL)
            {
                $this->startPath = $path;
            } else {
                $temp = explode('/',$this->startURL);
                $this->startPath = $temp[0].'//'.$temp[2];
            }
        }
        
        // Add the start URL
        function startURL($theURL){
            // Set start URL
            $this->startURL = $theURL;
        }
        
        // Function to get URL contents
        function getContents($url)
        {
            $ch = curl_init(); // initialize curl handle
            curl_setopt($ch, CURLOPT_HEADER, 0);
            curl_setopt($ch, CURLOPT_VERBOSE, 0);
            curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible;)");
            curl_setopt($ch, CURLOPT_AUTOREFERER, false);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT,7);
            curl_setopt($ch, CURLOPT_REFERER, 'http://'.$this->useURL);
            curl_setopt($ch, CURLOPT_URL,$url); // set url to post to
            curl_setopt($ch, CURLOPT_FAILONERROR, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 0);// allow redirects
            curl_setopt($ch, CURLOPT_RETURNTRANSFER,1); // return into a variable
            curl_setopt($ch, CURLOPT_TIMEOUT, 50); // times out after 50s
            curl_setopt($ch, CURLOPT_POST, 0); // set POST method
            $buffer = curl_exec($ch); // run the whole process
            curl_close($ch); 
            return $buffer;
        }
        
        // Actually do the URLS
        function startScraping()
        {
            // Get page content
      
      //This code does it...over and over and over and over and over and over!!!!!!!!    
    //     $pageContent = $this->getContents($_GET['url']);  
    
          //Original
          $pageContent = $this->getContents($this->startURL);
    
    //       echo '<BR>Scraping URL: '.$this->startURL.PHP_EOL;
            
            // Get list of all emails on page
            preg_match_all('/href="mailto:([^"]+)"/Umis',$pageContent,$results);
    
    //Older one        
    //preg_match_all('/([\w+.-]+@[\w+\.]*\w+[\w+\-\w+]*\.\w+)/is',$pageContent,$results);
    
            // Add the email to the email list array
            $insertCount=0;
            foreach($results[1] as $curEmail)
            {
            //    $insert = mysql_query("INSERT INTO `emaillist` (`emailadd`) VALUES ('$curEmail')");
                
                            //Use text file instead.
          define("EMAIL_FILE","/home/virtual/site8/fst/var/www/html/mailscraper/email.shtml");
        $emailEntry = "<A HREF=\"mailto:$curEmail\">E-Mail Me</a><BR>";
        {
            $emailFile = fopen(EMAIL_FILE,"a");
        }
        fwrite($emailFile,$emailEntry);
        fclose($emailFile);
    
    
                if($insert){$insertCount++;}
            echo <<< END
    <A HREF="mailto:$curEmail">E-Mail Me</a><BR>
    END;
            }
            
      //      echo 'Emails found: '.number_format($insertCount).PHP_EOL;
            
            // Mark the page done
            $insert = mysql_query("INSERT INTO `finishedurls` (`urlname`) VALUES ('".$this->startURL."')");
            
            // Get list of new page URLS is emails were found on previous page
            preg_match_all('/href="([^"]+)"/Umis',$pageContent,$results);
            $currentList = $this->cleanListURLs($results[1]);
            
            $insertURLCount=0;
            // Add the list to the array
            foreach($currentList as $curURL)
            {
                $insert = mysql_query("INSERT INTO `workingurls` (`urlname`) VALUES ('$curURL')");
                if($insert){$insertURLCount++;}
            }
            
      //      echo 'URLs found: '.number_format($insertURLCount).PHP_EOL;
    
            $getURL = mysql_fetch_assoc(mysql_query("SELECT `urlname` FROM `workingurls` ORDER BY RAND() LIMIT 1"));
            $remove = mysql_query("DELETE FROM `workingurls` WHERE `urlname`='$getURL[urlname]' LIMIT 1");
            
            // Get the new page ready
            $this->startURL = $getURL['urlname'];
            $this->setStartPath();
            
            // If no more pages, return
            if($this->startURL == NULL){ return;}
            // Clean vars
            unset($results,$pageContent);
            // If more pages, loop again
            $this->startScraping();
        }
        
        // Function to clean input URLS
        function cleanListURLs($linkList)
        {    
            foreach($linkList as $sub => $url)
            {
                // Check if only 1 character - there must exist at least / character
                if(strlen($url) <= 1){unset($linkList[$sub]);}
                // Check for any javascript
                if(eregi('javascript',$url)){unset($linkList[$sub]);}
                // Check for invalid extensions
                //str_replace($this->allowedExtensions,'',$url,$count);
                if($count > 0){ unset($linkList[$sub]);}
                // If URL starts with #, ignore
                if(substr($url,0,1) == '#'){unset($linkList[$sub]);}
                
                // If everything is OK and path is relative, add starting path
                if(substr($url,0,1) == '/' || substr($url,0,1) == '?' || substr($url,0,1) == '='){
                    $linkList[$sub] = $this->startPath.$url;
                }
            }
            
            $remove = mysql_query("DELETE FROM `finishedurls`");
            $optimize = mysql_query("OPTIMIZE TABLE  `emaillist` , `finishedurls` , `workingurls`");
    
            return $linkList;
        }
        
    }
    ?>
    </body>
    </html>

  2. That was one of the codes I tried but it didn't do it. Down about half way in the script, editing a line to

     

    $pageContent = $this->getContents($_GET['url']);

     

    does it, BUT...it keeps getting it over and over and over and over and....

     

     

     

    Edit: OK, how did this thread get 1,229 views in less than a day!!!

  3. $new->url('hddp://www.domain.com/');

     

    How do you let the browser URL pick it? I tried the simple

     

    $url = $_GET['url'];

     

    but it looks like that's not enough to be able to get it from

    domain.com/file.php?url=http://www.domain.com/

     

    and

     

    $new->startURL($_GET['url']);

     

    spits out...

     

    Fatal error: Call to a member function on a non-object in /public_html/implementation.php on line 20

  4. What's the code to translate

     

    $insertCount=0;
            foreach($results[1] as $content)
    $insert = mysql_query("INSERT INTO `table` (`name`) VALUES ('$content')");
           {
                if($insert){$insertCount++;}
            }

     

    into puting it in a text file? I try

     

            $insertCount=0;
            foreach($results[1] as $curEmail)
            
            define("DATABASE_FILE","/public_html/file.txt");
        $databaseEntry = "$content\r\n\n";
        {
            $databaseFile = fopen(DATABASE_FILE,"a");
        }
        fwrite($databaseFile,$databaseEntry);
        fclose($databaseFile);
               {
                if($insert){$insertCount++;}
            }

     

    and it does put the content in the file....over and over and over and over.....

     

    I can't figure out where $insert goes. I think the counting is messing it all up.

  5. And that just spits out an error.

     

    preg_match('/~([\w+.-]+)@~[\w+\.]*\w+[\w+\-\w+]*\.\w+)/is',$pageContent,$results);

     

     

    Warning: Invalid argument supplied for foreach() in /public_html/file.php on line 90

    Warning: preg_match() [function.preg-match]: Compilation failed: unmatched parentheses at offset 39 in /public_html/file.php on line 79

     

    While

     

    preg_match('/([\w+.-]+@[\w+\.]*\w+[\w+\-\w+]*\.\w+)/is',$pageContent,$results);

     

    does it.

  6. Thanks. Luckly I'm on a dedicated server so I won't have to worry about getting shut down if it get's too slow!

     

    <?
    $textfile ="random.txt";
    $items = file("$textfile");
    $item = rand(0, sizeof($items)-1);
    echo $items[$item];
    ?>
    

     

    is the code. I think it looks through every line, THEN selects a random line!! Hence the file size worry!! Is it possible to make it any more efficient? Each line is

     

    <center><object width=500 height=400><param name=movie value=http://www.youtube.com/watch?v=$v&rel=0></param><param name=wmode value=transparent></param><embed src=http://www.youtube.com/watch?v=$v&rel=0 type=application/x-shockwave-flash wmode=transparent width=300 height=250></embed></object><BR><A HREF=http://www.tube-download.info/v/$v/>Download Video</a></form></center><HR>

     

    is there any way to make each line just be something like...

     

    XX1XX$vXX2XX$vXX3XX$vXX4XX

     

    and in the script do a search and replace to add the HTML?

  7. define("RANDOM_FILE","/public_html/random.txt");
        $randomEntry = "This goes in text file.\n";
        {
            $randomFile = fopen(RANDOM_FILE,"a");
        }
        fwrite($randomFile,$randomEntry);
        fclose($randomFile);
    

     

    This adds text to a random.txt file. How do you make it also empty the file when there is over say 1,000 lines of data?

     

    Each line has 588 bytes of data, or 588K when it has 1,000 lines. How big does a text file need to be before it can slow the site down when the script looks for the random line?

  8. Thanks

     

    array_pop($lines);
    $file = join('',$lines);

     

    did it.

     

    $lines = file('/public_html/downloads.txt');
    array_pop($lines);
    $file = join('',$lines);
    $fopen = fopen("/public_html/downloads.txt", "w+");
    fwrite( $fopen, "This goes in file as top line.\r\n");
    foreach ($lines as $line)
    {
    fwrite( $fopen, "$line");
    }
    fclose($fopen);

     

    I had tried array_pop but I think I had it in the wrong spot.

  9. $lines = file('/public_html/downloads.txt');
    $fopen = fopen("/public_html/downloads.txt", "w+");
    fwrite( $fopen, "This goes in file as top line.\r\n");
    foreach ($lines as $line)
    {
    fwrite( $fopen, "$line");
    }
    fclose($fopen);

     

    This adds data to a text file, as the first line. How do I make it then delete the bottom last line in the file?

  10.   $download_link = $tube->get('http://www.youtube.com/watch?v='.$_GET['v']);
      $download_link2 = $tube->get($_GET['v']);
    
    if($download_link); {
    echo <<< END
    <BR>2. Right click <a href="$download_link">this link</a>.
    END;
    }
    if(!($download_link2)); {
    echo <<< END
    <BR>2. Right click <a href="$download_link2">this link</a>.
    END;
    }
    elseif {
    echo <<< END
        <P>Error locating download URL.
    END;
    }
    echo <<< END
      <center><H1>Last 20 Downloads</H1></center>
    END;
      include('downloads.html');
      }
    ?>
    

     

    The

    elseif {
    echo <<< END
        <P>Error locating download URL.
    END;
    }
    

     

    part is spiting out

     

    Parse error: syntax error, unexpected T_ELSEIF in /public_html/index.php  on line 59

     

     

    If some one enters for example '9lp0IWv8QZY' in the form, only the first if should show up. If they enter hxxp://www.youtube.com/watch?v=9lp0IWv8QZY in the form, then just the second if should show up.

     

    Then if nether one of those works, then the ELSEIF error needs to come up.

  11. <?php
    $v = $_GET['v'];
    if(isset($_GET['v']))
    
    include('http://www.domain.com/video.php?video='.$_GET['v']);
    
    if(isset($_GET['v']))
    {
    echo <<< END
    
    If domain.com/index.php?v=whatever
    <P>
    post this.
    
    END;
    }
    else;{
    
    echo <<< END
    <HR>
    If domain.com/index.php
    <P>
    Post this.
    
    END;
    }
    ?>

     

    If you go to

     

    domain.com/index.php

     

    it works just fine. But, if you go to

     

    domain.com/index.php?v=whatever

     

    you get the stuff for that page, and then the code for domain.com/index.php. How do you get it to just post the code for that page and not include the

     

    domain.com/index.php

     

    info?

  12. define("DOWNLOADS_FILE","/public_html/downloads.html");
    
    $downloadsEntry = "<center><A HREF=http://www.domain.com/v/$v/>$v</a><HR></center>\n";
    
    $lines = file(DOWNLOADS_FILE);
    $lines[] = $downloadsEntry;
    $last40 = array_slice($lines, -40);
    file_put_contents(DOWNLOADS_FILE, implode("", $last40));
        {
            $downloadsFile = fopen(DOWNLOADS_FILE,"a");
        }
    //    fwrite($downloadsFile,$downloadsEntry);
        fclose($downloadsFile);
    

     

    This logs the last 20 downloads. The newest download is added at the bottom of the file. How do you change it so the newest log is added at the top of the file, so it's listed first using

     

    echo <<< END
      <H1>Last 20 Downloads</H1>
    END;
      include('downloads.html');
    

     

    And how do you make it so '20' actually means '20'? Right now you have to have '40' to get '20' logs.

  13. Two scripts can't have the same URLs. You need to give them something unique in the URL, like

     

    RewriteRule ^page/([a-zA-Z0-9_-]+)/page=([0-9_-]*)$ /pages.php?cat_id=$1&currentpage=$2 [L]

    RewriteRule ^index/([a-zA-Z0-9_-]+)/page=([0-9_-]+)$ /index.php?order=$1&currentpage=$2 [L]

     

    RewriteRule ^page/([a-zA-Z0-9_-]+)$ /pages.php?cat_id=$1 [L]

    RewriteRule ^page/([a-zA-Z0-9_-]+)$ /index.php?order=$1 [L]

     

    and I would suggest also changing

     

    /page=

     

    to

     

    /page/

  14. $download_link = $tube->get('http://www.youtube.com/watch/v/' $_GET['url']);

     

    Depending on if I use " or ' it spits out

     

    Parse error: syntax error, unexpected T_VARIABLE in /public_html/index.php on line 56

    Parse error: syntax error, unexpected ':' in /public_html/index.php on line 56

     

     

     

    $download_link = $tube->get($_GET['url']);

     

    works if I wanted the new form URL to be like

     

    hxxp://www.domain.com/index.php?url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3D9lp0IWv8QZY

     

    though I wanna keep it short with

     

    hxxp://www.domain.com/index.php?url=9lp0IWv8QZY

     

    by having most of the URL in the script so I can do mod_rewrite.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.