donatello Posted December 6, 2010

I am working on an email extractor script that will extract emails from a site. I have a working script that will extract them from a single URL, but what I need it to do is to follow the links on the page. Here is my email script:

<?php
$the_url = isset($_REQUEST['url']) ? htmlspecialchars($_REQUEST['url']) : '';
?>
<form method="post">
    Please enter full URL of the page to parse (including http://):<br />
    <input type="text" name="url" size="65" value="http://<?php echo str_replace('http://', '', $the_url); ?>"/><br />
    or enter text directly into textarea below:<br />
    <textarea name="text" cols="50" rows="15"></textarea>
    <br />
    <input type="submit" value="Parse Emails" />
</form>
<?php
if (isset($_REQUEST['url']) && !empty($_REQUEST['url'])) {
    // fetch data from specified url
    $text = file_get_contents($_REQUEST['url']);
} elseif (isset($_REQUEST['text']) && !empty($_REQUEST['text'])) {
    // get text from text area
    $text = $_REQUEST['text'];
}

// parse emails
if (!empty($text)) {
    $res = preg_match_all(
        "/[a-z0-9]+([_\\.-][a-z0-9]+)*@([a-z0-9]+([\.-][a-z0-9]+)*)+\\.[a-z]{2,}/i",
        $text,
        $matches
    );
    if ($res) {
        foreach (array_unique($matches[0]) as $email) {
            echo $email . "<br />";
        }
    } else {
        echo "No emails found.";
    }
}
?>
<!-- Email Extractor END -->

It's a bit rough and quirky, but it works for a single URL. Here is the email extractor in action: http://www.site-search.org/email-extractor.php

My ideal solution would be to combine this script with my URL extractor/link-extractor script:

<!-- URL Extractor BEGIN -->
<?php
// findlinks.php
// php code example: find links in an html page
// mallsop.com 2006 gpl

echo "<form method=post action=\"$PHP_SELF\"> \n";
echo "<p><table align=\"absmiddle\" width=\"100%\" bgcolor=\"#cccccc\" name=\"tablesiteopen\" border=\"0\">\n";
echo "<tr><td align=left>";

if ($_POST["FindLinks"]) {
    $urlname = trim($_POST["urlname"]);
    if ($urlname == "") {
        echo "Please enter a URL. <br>\n";
    } else {
        // open the html page and parse it
        $page_title = "n/a";
        $links[0] = "n/a";
        //$meta_descr = "n/a";
        //$meta_keywd = "n/a";

        if ($handle = @fopen($urlname, "r")) { // must be able to read it
            $content = "";
            while (!feof($handle)) {
                $part = fread($handle, 1024);
                $content .= $part;
                // if (eregi("</head>", $part)) break;
            }
            fclose($handle);

            $lines = preg_split("/\r?\n|\r/", $content); // turn the content into rows

            // boolean
            $is_title = false;
            //$is_descr = false;
            //$is_keywd = false;
            $is_href = false;
            $index = 0;
            //$close_tag = ($xhtml) ? " />" : ">"; // new in ver. 1.01
            foreach ($lines as $val) {
                if (eregi("<title>(.*)</title>", $val, $title)) {
                    $page_title = $title[1];
                    $is_title = true;
                }
                if (eregi("<a href=(.*)</a>", $val, $alink)) {
                    $newurl = $alink[1];
                    $newurl = eregi_replace(' target="_blank"', "", $newurl);
                    $newurl = eregi_replace(' rel="nofollow"', "", $newurl);
                    $newurl = eregi_replace(" title=\"(.*)\"", "", $newurl);
                    $newurl = trim($newurl);

                    $pos1 = strpos($newurl, "/>");
                    if ($pos1 !== false) {
                        $newurl = substr($newurl, 1, $pos1);
                    }
                    $pos2 = strpos($newurl, ">");
                    if ($pos2 !== false) {
                        $newurl = substr($newurl, 1, $pos2);
                    }
                    $newurl = eregi_replace("\"", "", $newurl);
                    $newurl = eregi_replace(">", "", $newurl);

                    //if (!eregi("http", $newurl)) { // local
                    //    $newurl = "http://".$_SERVER["HTTP_HOST"]."/".$newurl;
                    //}
                    if (!eregi("http", $newurl)) { // local
                        $pos1 = strpos($newurl, "/");
                        if ($pos1 == 0) {
                            $newurl = substr($newurl, 1);
                        }
                        $newurl = $urlname."/".$newurl;
                    }

                    // put in array of found links
                    $links[$index] = $newurl;
                    $index++;
                    $is_href = true;
                }
            } // foreach lines done

            echo "<h2>Extracted Links</h2>\n";
            echo "<p><b>Page Summary</b><br>\n";
            echo "<b>Url:</b> ".$urlname."<br>\n";
            if ($is_title) {
                echo "<b>Title:</b> ".$page_title."<br>\n";
            } else {
                echo "No title found<br>\n";
            }
            echo "<b>Links:</b><br>\n";
            if ($is_href) {
                foreach ($links as $myval) {
                    echo "<a href=\"$myval\">".$myval."</a><br>\n";
                }
            } else {
                echo "No links found<br>\n";
            }
            echo "End</p>\n";
        } // fopen handle ok
        else {
            echo "<br>The url $urlname does not exist or there was an fopen error.<br>";
        }
        echo "<br /><br /><h4><a href=\"http://www.site-search.org/url-extractor.php\" title=\"Link Extractor\">Try Again</a></h4>";
    } // end else urlname given
} // else find links now submit
else {
    $urlname = ""; // or whatever page you like
    echo "<br /><br />\n";
    echo "<p><h2>Link Extractor</h2><br>\n";
    echo "File or URL: <input type=\"TEXT\" name=\"urlname\" value=\"http://\" maxlength=\"255\" size=\"80\">\n";
    echo "<input type=\"SUBMIT\" name=\"FindLinks\" value=\"Extract Links\"></font><br></p> \n";
    echo "<br /><br />\n";
}
echo "</td></tr>";
echo "</table></p>";
echo "</form></BODY></HTML>\n";
?>
<!-- URL Extractor END -->

Here is the script in action: http://www.site-search.org/url-extractor.php
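Side note: eregi() and eregi_replace() are deprecated in newer PHP, so the line-by-line parsing above is fairly fragile. A DOMDocument-based version of the link pull would probably be more robust. Here is a rough, untested sketch of that idea (the URL is just a placeholder, not a real target):

<?php
// Untested sketch: pull all hrefs from a page with DOMDocument instead of eregi().
$urlname = "http://www.example.com/";            // placeholder URL

$content = @file_get_contents($urlname);
if ($content === false) {
    die("The url $urlname does not exist or could not be read.");
}

$dom = new DOMDocument();
@$dom->loadHTML($content);                       // @ hides warnings from sloppy HTML

$links = array();
foreach ($dom->getElementsByTagName('a') as $a) {
    $href = trim($a->getAttribute('href'));
    if ($href == '' || $href[0] == '#') {
        continue;                                // skip empty and in-page anchors
    }
    if (stripos($href, 'http') !== 0) {
        // crude relative-to-absolute conversion, same idea as in the script above
        $href = rtrim($urlname, '/') . '/' . ltrim($href, '/');
    }
    $links[] = $href;
}

foreach (array_unique($links) as $link) {
    echo "<a href=\"$link\">$link</a><br>\n";
}
?>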
trq Posted December 6, 2010

Do you have a question?
donatello Posted December 6, 2010 (Author)

"Do you have a question?"

Yes. I was looking to combine the two scripts, and after several unsuccessful attempts I am pleading for help...

The final script should be able to pull all of the links out of the page, as the link extractor I posted above already does. Then it should fetch each of the found pages, parse them for email addresses, and print the results. I'm not sure how to combine these two scripts to make this work.
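Roughly, the combined flow I'm after would look something like this untested sketch. extract_emails() and extract_links() are just placeholder helper names, the link grab is a crude regex rather than a full parser, and it only follows links one level deep from the starting URL:

<?php
// Untested sketch of the combined flow: fetch the start page, pull its links,
// then run the same email regex over the start page and every linked page.
// extract_emails() and extract_links() are made-up helper names.

function extract_emails($html) {
    // Same pattern as the single-page email extractor above.
    $pattern = "/[a-z0-9]+([_\\.-][a-z0-9]+)*@([a-z0-9]+([\.-][a-z0-9]+)*)+\\.[a-z]{2,}/i";
    preg_match_all($pattern, $html, $matches);
    return $matches[0];
}

function extract_links($html, $base) {
    // Crude href grab; good enough for a sketch.
    preg_match_all('/href=["\']([^"\'#]+)["\']/i', $html, $matches);
    $links = array();
    foreach ($matches[1] as $href) {
        if (stripos($href, 'mailto:') === 0) {
            continue;                            // skip mailto links
        }
        if (stripos($href, 'http') !== 0) {
            // crude relative-to-absolute conversion
            $href = rtrim($base, '/') . '/' . ltrim($href, '/');
        }
        $links[] = $href;
    }
    return array_unique($links);
}

$start_url = isset($_REQUEST['url']) ? $_REQUEST['url'] : '';
$emails    = array();

$start_html = @file_get_contents($start_url);
if ($start_html !== false) {
    // Emails on the starting page itself.
    $emails = array_merge($emails, extract_emails($start_html));

    // One level deep: fetch each linked page and harvest its emails too.
    foreach (extract_links($start_html, $start_url) as $link) {
        $html = @file_get_contents($link);
        if ($html !== false) {
            $emails = array_merge($emails, extract_emails($html));
        }
    }
}

if (!empty($emails)) {
    foreach (array_unique($emails) as $email) {
        echo $email . "<br />";
    }
} else {
    echo "No emails found.";
}
?>

A real version would also want to stay on the starting domain, skip URLs it has already fetched, and probably add a timeout, but that is the basic shape of what I'm trying to build.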