strago

February 27, 2010

Turns out

$new->setStartPath(http://wwww.domain.com/);

just needed to have the $url in it instead!

$new->setStartPath($url);

February 26, 2010

About half way down there's the one line code that tries to do this.

<?php

//Doesn't do it. Code down below tries to.
$url = $_GET['url'];

$DB_USER =  'root';
$DB_PASSWORD = 'XXXXXXXXX';
$DB_HOST = 'localhost';
$DB_NAME = 'mailscraper';
$dbc = mysql_connect ($DB_HOST, $DB_USER, $DB_PASSWORD) or $error = mysql_error();
mysql_select_db($DB_NAME) or $error = mysql_error();

$new = new scraper;
// Start Path can be empty, which will be extracted from the start URL
$new->setStartPath();
//$new->setStartPath('http://forums.digitalpoint.com');
//$new->startURL('http://www.cmgscc.com/mailscraper/emails.shtml');


$new->startScraping();


class scraper
{
    // URL that stores first URL to start
    var $startURL;
    
    // List of allowed page extensions
    var $allowedExtensions = array('.css','.xml','.rss','.ico','.js','.gif','.jpg','.jpeg','.png','.bmp','.wmv'
        ,'.avi','.mp3','.flash','.swf','.css');
    
    // Which URL to scrape
    var $useURL;
    
    // Start path, for links that are relative
    var $startPath;
    
    // Set start path
    function setStartPath($path = NULL){
        if($path != NULL)
        {
            $this->startPath = $path;
        } else {
            $temp = explode('/',$this->startURL);
            $this->startPath = $temp[0].'//'.$temp[2];
        }
    }
    
    // Add the start URL
    function startURL($theURL){
        // Set start URL
        $this->startURL = $theURL;
    }
    
    // Function to get URL contents
    function getContents($url)
    {
        $ch = curl_init(); // initialize curl handle
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_VERBOSE, 0);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible;)");
        curl_setopt($ch, CURLOPT_AUTOREFERER, false);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT,7);
        curl_setopt($ch, CURLOPT_REFERER, 'http://'.$this->useURL);
        curl_setopt($ch, CURLOPT_URL,$url); // set url to post to
        curl_setopt($ch, CURLOPT_FAILONERROR, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 0);// allow redirects
        curl_setopt($ch, CURLOPT_RETURNTRANSFER,1); // return into a variable
        curl_setopt($ch, CURLOPT_TIMEOUT, 50); // times out after 50s
        curl_setopt($ch, CURLOPT_POST, 0); // set POST method
        $buffer = curl_exec($ch); // run the whole process
        curl_close($ch); 
        return $buffer;
    }
    
    // Actually do the URLS
    function startScraping()
    {
        // Get page content
  
  //This code does it...over and over and over and over and over and over!!!!!!!!    
//     $pageContent = $this->getContents($_GET['url']);  

      //Original
      $pageContent = $this->getContents($this->startURL);

//       echo '<BR>Scraping URL: '.$this->startURL.PHP_EOL;
        
        // Get list of all emails on page
        preg_match_all('/href="mailto:([^"]+)"/Umis',$pageContent,$results);

//Older one        
//preg_match_all('/([\w+.-]+@[\w+\.]*\w+[\w+\-\w+]*\.\w+)/is',$pageContent,$results);

        // Add the email to the email list array
        $insertCount=0;
        foreach($results[1] as $curEmail)
        {
        //    $insert = mysql_query("INSERT INTO `emaillist` (`emailadd`) VALUES ('$curEmail')");
            
                        //Use text file instead.
      define("EMAIL_FILE","/home/virtual/site8/fst/var/www/html/mailscraper/email.shtml");
    $emailEntry = "<A HREF=\"mailto:$curEmail\">E-Mail Me</a><BR>";
    {
        $emailFile = fopen(EMAIL_FILE,"a");
    }
    fwrite($emailFile,$emailEntry);
    fclose($emailFile);


            if($insert){$insertCount++;}
        echo <<< END
<A HREF="mailto:$curEmail">E-Mail Me</a><BR>
END;
        }
        
  //      echo 'Emails found: '.number_format($insertCount).PHP_EOL;
        
        // Mark the page done
        $insert = mysql_query("INSERT INTO `finishedurls` (`urlname`) VALUES ('".$this->startURL."')");
        
        // Get list of new page URLS is emails were found on previous page
        preg_match_all('/href="([^"]+)"/Umis',$pageContent,$results);
        $currentList = $this->cleanListURLs($results[1]);
        
        $insertURLCount=0;
        // Add the list to the array
        foreach($currentList as $curURL)
        {
            $insert = mysql_query("INSERT INTO `workingurls` (`urlname`) VALUES ('$curURL')");
            if($insert){$insertURLCount++;}
        }
        
  //      echo 'URLs found: '.number_format($insertURLCount).PHP_EOL;

        $getURL = mysql_fetch_assoc(mysql_query("SELECT `urlname` FROM `workingurls` ORDER BY RAND() LIMIT 1"));
        $remove = mysql_query("DELETE FROM `workingurls` WHERE `urlname`='$getURL[urlname]' LIMIT 1");
        
        // Get the new page ready
        $this->startURL = $getURL['urlname'];
        $this->setStartPath();
        
        // If no more pages, return
        if($this->startURL == NULL){ return;}
        // Clean vars
        unset($results,$pageContent);
        // If more pages, loop again
        $this->startScraping();
    }
    
    // Function to clean input URLS
    function cleanListURLs($linkList)
    {    
        foreach($linkList as $sub => $url)
        {
            // Check if only 1 character - there must exist at least / character
            if(strlen($url) <= 1){unset($linkList[$sub]);}
            // Check for any javascript
            if(eregi('javascript',$url)){unset($linkList[$sub]);}
            // Check for invalid extensions
            //str_replace($this->allowedExtensions,'',$url,$count);
            if($count > 0){ unset($linkList[$sub]);}
            // If URL starts with #, ignore
            if(substr($url,0,1) == '#'){unset($linkList[$sub]);}
            
            // If everything is OK and path is relative, add starting path
            if(substr($url,0,1) == '/' || substr($url,0,1) == '?' || substr($url,0,1) == '='){
                $linkList[$sub] = $this->startPath.$url;
            }
        }
        
        $remove = mysql_query("DELETE FROM `finishedurls`");
        $optimize = mysql_query("OPTIMIZE TABLE  `emaillist` , `finishedurls` , `workingurls`");

        return $linkList;
    }
    
}
?>
</body>
</html>

February 26, 2010

That was one of the codes I tried but it didn't do it. Down about half way in the script, editing a line to

$pageContent = $this->getContents($_GET['url']);

does it, BUT...it keeps getting it over and over and over and over and....

Edit: OK, how did this thread get 1,229 views in less than a day!!!

February 25, 2010

$new->url('hddp://www.domain.com/');

How do you let the browser URL pick it? I tried the simple

$url = $_GET['url'];

but it looks like that's not enough to be able to get it from

domain.com/file.php?url=http://www.domain.com/

and

$new->startURL($_GET['url']);

spits out...

Fatal error: Call to a member function on a non-object in /public_html/implementation.php on line 20

February 25, 2010

What's the code to translate

$insertCount=0;
        foreach($results[1] as $content)
$insert = mysql_query("INSERT INTO `table` (`name`) VALUES ('$content')");
       {
            if($insert){$insertCount++;}
        }

into puting it in a text file? I try

        $insertCount=0;
        foreach($results[1] as $curEmail)
        
        define("DATABASE_FILE","/public_html/file.txt");
    $databaseEntry = "$content\r\n\n";
    {
        $databaseFile = fopen(DATABASE_FILE,"a");
    }
    fwrite($databaseFile,$databaseEntry);
    fclose($databaseFile);
           {
            if($insert){$insertCount++;}
        }

and it does put the content in the file....over and over and over and over.....

I can't figure out where $insert goes. I think the counting is messing it all up.

February 24, 2010

And that just spits out an error.

preg_match('/~([\w+.-]+)@~[\w+\.]*\w+[\w+\-\w+]*\.\w+)/is',$pageContent,$results);

Warning: Invalid argument supplied for foreach() in /public_html/file.php on line 90

Warning: preg_match() [function.preg-match]: Compilation failed: unmatched parentheses at offset 39 in /public_html/file.php on line 79

While

preg_match('/([\w+.-]+@[\w+\.]*\w+[\w+\-\w+]*\.\w+)/is',$pageContent,$results);

does it.

February 24, 2010

preg_match('/\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*/is',$pageContent,$results);

checks for xxx@ domain.com. What do you change the code to for checking for

xxx-xxx-xxx@ domain.com

February 21, 2010

Thanks. Luckly I'm on a dedicated server so I won't have to worry about getting shut down if it get's too slow!

<?
$textfile ="random.txt";
$items = file("$textfile");
$item = rand(0, sizeof($items)-1);
echo $items[$item];
?>

is the code. I think it looks through every line, THEN selects a random line!! Hence the file size worry!! Is it possible to make it any more efficient? Each line is

<center><object width=500 height=400><param name=movie value=http://www.youtube.com/watch?v=$v&rel=0></param><param name=wmode value=transparent></param><embed src=http://www.youtube.com/watch?v=$v&rel=0 type=application/x-shockwave-flash wmode=transparent width=300 height=250></embed></object><BR><A HREF=http://www.tube-download.info/v/$v/>Download Video</a></form></center><HR>

is there any way to make each line just be something like...

XX1XX$vXX2XX$vXX3XX$vXX4XX

and in the script do a search and replace to add the HTML?

February 18, 2010

define("RANDOM_FILE","/public_html/random.txt");
    $randomEntry = "This goes in text file.\n";
    {
        $randomFile = fopen(RANDOM_FILE,"a");
    }
    fwrite($randomFile,$randomEntry);
    fclose($randomFile);

This adds text to a random.txt file. How do you make it also empty the file when there is over say 1,000 lines of data?

Each line has 588 bytes of data, or 588K when it has 1,000 lines. How big does a text file need to be before it can slow the site down when the script looks for the random line?

February 18, 2010

Thanks

array_pop($lines);
$file = join('',$lines);

did it.

$lines = file('/public_html/downloads.txt');
array_pop($lines);
$file = join('',$lines);
$fopen = fopen("/public_html/downloads.txt", "w+");
fwrite( $fopen, "This goes in file as top line.\r\n");
foreach ($lines as $line)
{
fwrite( $fopen, "$line");
}
fclose($fopen);

I had tried array_pop but I think I had it in the wrong spot.

February 18, 2010

$lines = file('/public_html/downloads.txt');
$fopen = fopen("/public_html/downloads.txt", "w+");
fwrite( $fopen, "This goes in file as top line.\r\n");
foreach ($lines as $line)
{
fwrite( $fopen, "$line");
}
fclose($fopen);

This adds data to a text file, as the first line. How do I make it then delete the bottom last line in the file?

February 18, 2010

Thanks. I'm actually the epitome of a php n00bie. (Think of a third grader trying to figure out algebra!) I just wanted it working and didn't care how I could get it to work!

When I found these forums, it was like I found heaven!!

February 18, 2010

  $download_link = $tube->get('http://www.youtube.com/watch?v='.$_GET['v']);
  $download_link2 = $tube->get($_GET['v']);

if($download_link); {
echo <<< END
<BR>2. Right click <a href="$download_link">this link</a>.
END;
}
if(!($download_link2)); {
echo <<< END
<BR>2. Right click <a href="$download_link2">this link</a>.
END;
}
elseif {
echo <<< END
    <P>Error locating download URL.
END;
}
echo <<< END
  <center><H1>Last 20 Downloads</H1></center>
END;
  include('downloads.html');
  }
?>

The

elseif {
echo <<< END
    <P>Error locating download URL.
END;
}

part is spiting out

Parse error: syntax error, unexpected T_ELSEIF in /public_html/index.php on line 59

If some one enters for example '9lp0IWv8QZY' in the form, only the first if should show up. If they enter hxxp://www.youtube.com/watch?v=9lp0IWv8QZY in the form, then just the second if should show up.

Then if nether one of those works, then the ELSEIF error needs to come up.

February 17, 2010

<?php
$v = $_GET['v'];
if(isset($_GET['v']))

include('http://www.domain.com/video.php?video='.$_GET['v']);

if(isset($_GET['v']))
{
echo <<< END

If domain.com/index.php?v=whatever
<P>
post this.

END;
}
else;{

echo <<< END
<HR>
If domain.com/index.php
<P>
Post this.

END;
}
?>

If you go to

domain.com/index.php

it works just fine. But, if you go to

domain.com/index.php?v=whatever

you get the stuff for that page, and then the code for domain.com/index.php. How do you get it to just post the code for that page and not include the

domain.com/index.php

info?

February 17, 2010

define("DOWNLOADS_FILE","/public_html/downloads.html");

$downloadsEntry = "<center><A HREF=http://www.domain.com/v/$v/>$v</a><HR></center>\n";

$lines = file(DOWNLOADS_FILE);
$lines[] = $downloadsEntry;
$last40 = array_slice($lines, -40);
file_put_contents(DOWNLOADS_FILE, implode("", $last40));
    {
        $downloadsFile = fopen(DOWNLOADS_FILE,"a");
    }
//    fwrite($downloadsFile,$downloadsEntry);
    fclose($downloadsFile);

This logs the last 20 downloads. The newest download is added at the bottom of the file. How do you change it so the newest log is added at the top of the file, so it's listed first using

echo <<< END
  <H1>Last 20 Downloads</H1>
END;
  include('downloads.html');

And how do you make it so '20' actually means '20'? Right now you have to have '40' to get '20' logs.

February 16, 2010

Thanks. That did it.

Is there any way to make it so that if you enter the full URL in the form, the part of the URL before the ID get's deleted?

$v = str_replace($v, "", "http://www.youtube.com/watch?v=");

$download_link = $tube->get('http://www.youtube.com/watch?v='.$_GET['v']);

doesn't take it out.

February 16, 2010

Two scripts can't have the same URLs. You need to give them something unique in the URL, like

RewriteRule ^page/([a-zA-Z0-9_-]+)/page=([0-9_-]*)$ /pages.php?cat_id=$1&currentpage=$2 [L]

RewriteRule ^index/([a-zA-Z0-9_-]+)/page=([0-9_-]+)$ /index.php?order=$1&currentpage=$2 [L]

RewriteRule ^page/([a-zA-Z0-9_-]+)$ /pages.php?cat_id=$1 [L]

RewriteRule ^page/([a-zA-Z0-9_-]+)$ /index.php?order=$1 [L]

and I would suggest also changing

/page=

to

/page/

February 16, 2010

$download_link = $tube->get('http://www.youtube.com/watch/v/' $_GET['url']);

Depending on if I use " or ' it spits out

Parse error: syntax error, unexpected T_VARIABLE in /public_html/index.php on line 56

Parse error: syntax error, unexpected ':' in /public_html/index.php on line 56

$download_link = $tube->get($_GET['url']);

works if I wanted the new form URL to be like

hxxp://www.domain.com/index.php?url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3D9lp0IWv8QZY

though I wanna keep it short with

hxxp://www.domain.com/index.php?url=9lp0IWv8QZY

by having most of the URL in the script so I can do mod_rewrite.

Sign In

strago

Posts

Joined

Last visited

Content Type

Profiles

Forums

Posts posted by strago

Get data from URL.

Get data from URL.

Get data from URL.

Get data from URL.

Data in text file instead of mySQL database.

Tiny little preg_match question.

Tiny little preg_match question.

Empty text file when there is over XXXX lines of text.

Empty text file when there is over XXXX lines of text.

Remove last line of text file.

Remove last line of text file.

Parse error: syntax error, unexpected T_ELSEIF

Parse error: syntax error, unexpected T_ELSEIF

Little else trouble.

Newest log first, and 'fuzy math.'

Parse error: syntax error, unexpected T_VARIABLE.

Multiple RewriteRule's seem to be Conflicting

Parse error: syntax error, unexpected T_VARIABLE.

Browse

Activity

Important Information