
Robot Detection Class - For Your Use


Muddy_Funster


I have knocked up this little class for retrieving a list of known robot user agents from the really rather helpful people over at robotstxt.org.  It pulls info from their site and builds an array that can be used to compare against the $_SERVER['HTTP_USER_AGENT'] variable.  It has an exclusion array that can be altered to suit your personal preferences, and it can be echoed directly to produce a valid JSON string that can be passed as-is to jQuery/JavaScript using AJAX, or anything of that like.  I am putting no restrictions on this, but the people over at robotstxt.org do request that you give them a mention for accessing their data, so I leave that up to anyone who may want to use it.

 

Anyway, I found the need to ensure bots didn't get free rein of the site I was making, and thought that some other people out there may have a use for this.

 

Here it is, enjoy (maybe) - anyway, let me know what you guys think of it.  (P.S. - I'm new to the whole DocBlock thing... ;) )

<?php
/**
 * Generates a list of robot useragent definitions for use with
 * $_SERVER['HTTP_USER_AGENT'] to identify robots
 *
 * This links into the robotstxt.org site to access their current
 * robot list.  It then produces an array of these useragents that
 * can be used to check if a visitor is a robot or not.
 * Call: $yourVar = new getRobots();
 * $robotArray = $yourVar->robots;
 *
 * JSON output (if you want to pass to javascript): echo $yourVar;
 *
 *
 * @param string $url Link to robotstxt.org server
 * @param array $robots the array list of useragents
 * @return __toString Returns JSON string of Object{"robots":array[{"numericalKey":"useragentText"}]}
 */
class getRobots{
    public $url;
    public $robots=array();

    public function __construct($url = "http://www.robotstxt.org/db/all.txt"){
        $fullList = file($url);
        $exclusions = array //add lines here to include exclusions for any other agents in the list
            (
                "",
                "no",
                "Due to a deficiency in Java it's not currently possible to set the User-Agent.",
                "???",
                "no",
                "yes"
            );
        echo "<pre>";
        foreach ($fullList as $line=>$content){
            $delimit = ":";
            $split = explode($delimit, $content);
            if(trim($split[0]) == "robot-useragent"){
                $conCount = count($split);
                $agent = "";
                for($i=0;$i<$conCount;$i++){
                    if($i != 0){
                        $conPart = $i;
                        $agent .= " {$split[$conPart]} ";
                    }
                }
                array_push( $this->robots, trim($agent));
            }
        }
        foreach($this->robots as $key=>$agent){
            if(in_array($agent, $exclusions)){
                unset($this->robots[$key]);
            }
        }
    }
    public function __toString(){
        $json = "{\"robots\":[".json_encode($this->robots)."]}";
        return $json;
    }
}
?>
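
For anyone wanting a quick test drive, here's a minimal usage sketch (note it only flags visitors whose user agent string exactly matches an entry in the list):

<?php
// Minimal usage sketch - assumes the class above is saved in getRobots.php.
require 'getRobots.php'; // hypothetical file name
$detector = new getRobots();
if (in_array(trim($_SERVER['HTTP_USER_AGENT']), $detector->robots)) {
    echo "Looks like a known robot.";
} else {
    echo "Looks like a regular visitor.";
}
?>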

OK, here's the revision:

<?php
/**
 * Generates a list of robot useragent definitions for use with
 * $_SERVER['HTTP_USER_AGENT'] to identify robots
 *
 * This links into the robotstxt.org site to access their current
 * robot list.  It then produces an array of these useragents that
 * can be used to check if a visitor is a robot or not.
 * Call: $yourVar = new getRobots();
 * $robotArray = $yourVar->robots;
 * $yourVar->exclude(mixed $mixed); - send values to be excluded.
 *         Accepts either an array of values or a single string value
 * JSON output (if you want to pass to javascript): echo $yourVar;
 *
 *
 * @param array $robots the array list of useragents
 * @return __toString Returns JSON string of Object{"robots":array[{"numericalKey":"useragentText"}]}
 */
    class getRobots{
        public $robots=array();
    
        public function __construct($url = "http://www.robotstxt.org/db/all.txt"){
            $fullList = file($url);
            $exclusions = array //default exclusion list
                (
                    "Due to a deficiency in Java it's not currently possible to set the User-Agent.",
                    "Due to a deficiency in Java it's not currently possible",
                );
            echo "<pre>";
            foreach ($fullList as $line=>$content){
                $delimit = ":";
                $split = explode($delimit, $content);
                if(trim($split[0]) == "robot-useragent"){
                    $conCount = count($split);
                    $agent = "";
                    for($i=0;$i<$conCount;$i++){
                        if($i != 0){
                            $conPart = $i;
                            $agent .= " {$split[$conPart]} ";
                        }
                    }
                    array_push( $this->robots, trim($agent));
                }
            }
            $this->runExclusion($exclusions);
        }
        public function exclude($mixed){
            $this->runExclusion($mixed);
        }
        private function runExclusion($mixed){
            if(is_array($mixed)){
                foreach($this->robots as $key=>$agent){
                    if(in_array(trim($agent), $mixed)){
                        unset($this->robots[$key]);
                    }
                }
            }
            else{
                foreach($this->robots as $key=>$agent){
                    if(trim($agent) == trim($mixed)){
                        unset($this->robots[$key]);
                    }
                }
            }
        }
        public function __toString(){
            $json = "{\"robots\":[".json_encode($this->robots)."]}";
            return $json;
        }
    }
        
        
$robo = new getRobots();
$robo->exclude(array("",
                    "no",
                    "None",
                    "???",
                    "no",
                    "yes"
                    ));
var_dump($robo);
?>

Any other/new comments?


Please be aware that I provide criticism in a very direct manner, so do not read any personal attacks or derogatory intentions into my reply below.

 

It is a waste to remotely access and process the file each time this script is run. The script should instead process the file into the final array and cache it locally. Then, when the script is run, check to see if the remote file is newer than the last one processed. If not, use the cached file. If so, get the file and process it into a new cache file.

 

I don't understand the "exclude" array. How is someone wanting to use this supposed to know the structure and content of the exclude array?

 

I'm not understanding how the exclusions are done. The sample displayed doesn't make much sense to me. There is an if() statement based upon whether the input is an array or not. But, both the IF and ELSE statement blocks do the exact same thing! The ELSE block would fail for any other input - unless it is an object. I have to assume you didn't test this.

 

For the exclusions you will also need to cache something to prevent reprocessing the data each time. I would suggest having the script accept exclusions (as it does now) and then creating a checksum (e.g. MD5 hash or something similar) of all the exclusions and storing that. Then, when the script runs, verify if the currently generated checksum matches the stored one. If so, no need to process anything. Else, need to process the new exclusions.
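
A rough sketch of that checksum idea (the file name and exclusion values here are purely illustrative):

$exclusions = array("", "no", "yes"); // whatever exclusions are in play
$hash = md5(implode("|", $exclusions));
$hashFile = "exclusions.md5"; // hypothetical cache location
if (!file_exists($hashFile) || trim(file_get_contents($hashFile)) !== $hash) {
    // Exclusions changed (or first run): reprocess the list here,
    // then store the new checksum for next time.
    file_put_contents($hashFile, $hash);
}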

 

I see some inefficiencies as well. For example, you explode a line first, then do a check to see if the first element equals "robot-useragent". If it does then there are several lines to act on that record. Why not just do a strpos() check before going to the trouble of exploding the line. Heck, I'd probably create a method to use with array_filter() to remove all lines that do not have "robot-useragent" and then you have an array with just the data that is needed.
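
For illustration, the array_filter() route might look something like this (the function name is just an example):

function isAgentLine($line){
    // Keep only the lines that carry a robot-useragent record.
    return strpos($line, 'robot-useragent') !== false;
}
$fullList = file("http://www.robotstxt.org/db/all.txt");
$botLines = array_filter($fullList, 'isAgentLine');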

 

What is the purpose of the method exclude()? It only takes the input value and passes it to the method runExclusion(). So, why not make runExclusion() a public method and remove the exclude() method?


OK, a couple other things.

 

From the manual

 

 

Note: If you use array_push() to add one element to the array it's better to use $array[] = because in that way there is no overhead of calling a function.
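
Applied to this class, that just means replacing the array_push() call with a plain append:

$this->robots[] = trim($agent); // instead of array_push($this->robots, trim($agent));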

 

In the __construct method a <pre> tag is being echo'd.


Hey Psycho, thanks for taking the time to pass that feedback.  I appreciate it.  I'll get on making some of the changes tomorrow morning once I'm back in the office.  In answer to some of what you said -

 

I wrote the class with the intention of passing a flag into session on pass, so the check would only be run once per agent/per visit.  It was such a tiny difference calling against the remote site vs. a local copy that I decided just to access the remote each time; plus, being totally honest, I wouldn't know how to make a check on the files (local vs. remote) any more efficient than loading up the remote file each time anyway.
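
(For what it's worth, the session flag I had in mind would look something like this - the session key name is just an example:)

session_start();
if (!isset($_SESSION['is_bot'])) {
    // Only build the list and check the agent once per visit.
    $detector = new getRobots();
    $_SESSION['is_bot'] = in_array(trim($_SERVER['HTTP_USER_AGENT']), $detector->robots);
}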

 

I will add some commenting to explain the exclude array better; I'll confess I do tend to under-comment code I write for myself.  The else in runExclusion is there so that it can take either an array or a single string as input.  I did test this and it worked fine (at least I certainly thought that it did), though I will revisit it tomorrow and examine exactly what it's doing.

 

I am very interested in your suggestion of using the checksum, although I have never done anything using that kind of check.  I'll look into it as soon as I get some time and see if I can't refactor the script to use it some time in the not too distant future.

 

I went down the explode/rebuild route because when I first started it looked like I was simply dealing with a tab-delimited list; that turned out not to be the case, as it was using spaces and not tabs.  I just instantly fell into the thought that "I want to split a string based on a delimiter" and wound up changing the code to fit the thought process rather than the other way round.  I will have a shot at the strpos() idea as it does sound cleaner.  I'll also look into applying the array_filter() approach you suggested.

 

Using both the runExclusion and exclude methods instead of just the one method... to be honest, looking at it now I don't even know what I was thinking. I'll get that sorted in the morning.

 

The pre tag is just my carelessness, no excuses on that one, so it will go as soon as I get back in front of the code.  And I'll swap out the array_push for the use of $array[] at the same time.

 

Once again, thanks for taking the time to read over the code and comment on it.  Without the crit it (and I) wouldn't get any better.


I wrote the class with the intention of passing a flag into session on pass, so the check would only be run once per agent/per visit.  It was such a tiny difference calling against the remote site vs. a local copy that I decided just to access the remote each time; plus, being totally honest, I wouldn't know how to make a check on the files (local vs. remote) any more efficient than loading up the remote file each time anyway.

Something like this would only download the file if it has changed, otherwise you get a not modified response:

function downloadRobots($local){
	if (file_exists($local)){
		$mtime = filemtime($local);
		$ctx = stream_context_create(array(
			'http' => array(
				'header' => "If-modified-since: ".gmdate(DATE_RFC1123, $mtime)
			)
		));
	}
	else {
		$ctx = stream_context_create();
	}

	$fp = fopen("http://www.robotstxt.org/db/all.txt", 'rb', false, $ctx);
	$output = stream_get_contents($fp);
	$meta = stream_get_meta_data($fp);

	if (strpos($meta['wrapper_data'][0], ' 200 ') !== false){
		file_put_contents($local, $output);
	}

	fclose($fp);
}
You'd still send a request each time but the response would be smaller if no changes were made. You could expand it further to not send the request until the cache file is over x age.
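
A rough sketch of that age check (the one-day TTL is just an example):

$maxAge = 86400; // one day, in seconds
if (!file_exists($local) || time() - filemtime($local) >= $maxAge) {
    downloadRobots($local); // only touch the network once the cache is stale
}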

I would never rely upon getting an external file because it creates a single point of failure. And besides, how often is the file updated? Unless it is continually updated, it is a waste to fetch the file and do all the processing. As I stated, I would do ALL the processing (external file and exclusions) and store the results. That way you only need to get the file if a newer one exists, and you only need to reprocess if a new file exists or the exclusions have changed.

 

The else in the runExclusion is there so that it can take either an array or a single string as input.  I did test this and it worked fine (at least I certainly thought that it did) though I will revisit it tomorrow and examine exactly what it's doing.

 

I don't see how that is possible. And, if it is, then you don't need an IF/ELSE since they do the exact same thing.

 

 

 

This will return an array of JUST the lines that you need

 

        private function getBotData()
        {
            $output = array();
            $inputArray = file($this->url);
            foreach($inputArray as $line)
            {
                if (strpos($line, 'robot-useragent')!==false)
                {
                    $output[] = $line;
                }
            }
            return $output;
        }

 

 

 

I am very interested in your suggestion of using the checksum, although I have never done anything using that kind of check.  I'll look into it, as soon as I get some time and see if I cant refactor the script to use it some time in the not to distant future.

 

Use the process Kicken provided to determine if you need to get a new file. Then, change the class to operate as follows:

 

The construct method should just set a URL property of the class and set defaults (e.g. $exclusions). Then have a method to set additional exclusions - but do not process them - just save to an array property. Then have your methods to return the complete list (e.g. __toString). When the user calls that method, check to see if the external file is newer than the cache (or if the cache does not exist). If so, get the new file. Then check if the exclusions are different from the ones in the cache file.

 

If the cached file is good and no changes in exclusions, return the previously saved results.

If the cached file is good and there are changes to exclusions, use the cached file to process the new results and save them.

If the cached file is not good, download a new one and process it with the current exclusions and save the results.

 

So, you should have an array of ALL the exclusions before ever trying to process the file. Take that array and create an MD5 hash and save that value. You could include it as part of the results file.
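
For instance, the stored results file could take a shape like this (a sketch only - the file name and layout are one option among many):

$allExclusions   = array("", "no", "yes");  // example values
$processedRobots = array("ExampleBot/1.0"); // example values
$cache = array(
    'exclusions_md5' => md5(implode("|", $allExclusions)),
    'robots'         => $processedRobots,
);
file_put_contents("robots-cache.json", json_encode($cache)); // hypothetical file name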


@ kicken - Thanks for that, that's given me some reading to do. I haven't even looked at the stream_* functions at all, so that's going to take some looking into.

 

@ psycho - I must have really screwed the code on the if/else. The IF was for when $mixed was an array: it tries to match the array values from $mixed against the current iteration of $this->robots and drops the entry if it matches. The ELSE was to match the current iteration against the value of $mixed as a direct string comparison, allowing $robots->exclude($mixed) to be sent either a single string or an array.  I'll definitely get in about it tomorrow and post up something taking your suggestions on board.

 

Thanks for the time and help guys, I appreciate it and will try to make the most of your comments.


OK, been having a go at making the changes suggested.  Now, I have to confess to getting a little bit lost in it all, and I'm not sure if this is actually where I should have gone with it, but here is what I am sitting with at the moment.  It creates the files on first run and doesn't seem to update them if the conditions all match, but I'm not exactly sure if I have it set up right to handle partial changes:

<?php
/**
 * Generates a list of robot useragent definitions for use with
 * $_SERVER['HTTP_USER_AGENT'] to identify robots
 *
 * This links into the robotstxt.org site to access their current
 * robot list.  It then produces an array of these useragents that
 * can be used to check if a visitor is a robot or not.
 * Call: $yourVar = new getRobots();
 * $robotArray = $yourVar->robots;
 * $yourVar->exclude(mixed $mixed); - send values to be excluded.
 *         Accepts either an array of values or a single string value
 * JSON output (if you want to pass to javascript): echo $yourVar;
 *
 *
 * @param array $robots the array list of useragents
 * @return __toString Returns JSON string of Object{"robots":array[{"numericalKey":"useragentText"}]}
 */
    class getRobots{
        public $robots;
        public $excludes;
        private $url;
        private $lfPath;
        private $lfFile;
        private $hashVals;
        private $output;
    
        public function __construct(){
            $this->url = "http://www.robotstxt.org/db/all.txt";
            $this->lfPath= substr(__FILE__,0,strripos(__FILE__,'\\')+1).'robots';
            $this->lfFile= '\\rbtList.txt';
            $this->excludes[] = "Due to a deficiency in Java it's not currently possible to set the User-Agent.";
            $this->excludes[] = "Due to a deficiency in Java it's not currently possible";
            if(!is_dir($this->lfPath)){
                if(!mkdir($this->lfPath)){
                    echo "error creating directory! PHP must have write permissions for this folder -- $lfPath";
                    return false;
                    exit;
                }
            }
        }
        public function setExclude($mixed){
            if(is_array($mixed)){
                foreach($mixed as $key=>$toExclude){
                    $this->excludes[] = $toExclude;
                }
            }
            else{
                $this->excludes[] = $mixed;
            }
        }
        public function getBots(){
            $this->checkHashes();
            $this->robots = file($this->lfPath."\\justBots.txt");
            
        }
        private function checkFile(){
            if (file_exists($this->lfPath.$this->lfFile)){
                $mtime = filemtime($this->lfPath.$this->lfFile);
                $ctx = stream_context_create(array(
                    'http' => array(
                        'header' => "If-modified-since: ".gmdate(DATE_RFC1123, $mtime)
                    )
                ));
            }
            else {
                $ctx = stream_context_create();
            }
            $fp = fopen("http://www.robotstxt.org/db/all.txt", 'rb', false, $ctx);
            $this->output = stream_get_contents($fp);
            $this->checkBotList();
            $meta = stream_get_meta_data($fp);
            if (strpos($meta['wrapper_data'][0], ' 200 ') !== false){
                file_put_contents($this->lfPath.$this->lfFile, $this->output);
            }
            fclose($fp);            
        }    
        private function checkBotList(){
            if(!empty($this->output)){
                $oEx = explode("\n", $this->output);
            }
            else{
                $oEx = file($this->lfPath."\\justBots.txt");
            }
        
            foreach ($oEx as $key=>$line){
                if(strpos($line, 'robot-useragent:') !== FALSE){
                    $robots[] = trim(substr($line, 16));
                }
            }
            if(isset($robots)){
                foreach($this->excludes as $exclude){
                    foreach(array_keys($robots, $exclude) as $key){
                        $drop[] = $key;
                    }
                }
                foreach($drop AS $idx){
                    unset($robots[$idx]);
                }
                array_unique($robots);
                $bf = fopen($this->lfPath."\\justBots.txt",'w');
                $bots = implode("\n", $robots);
                $this->hashVals[1] = md5($bots);
                fwrite($bf,$bots);
                fclose($bf);
            }    
        }
        private function checkHashes(){
            $this->hashVals[0] = md5(implode("\n", $this->excludes));
            if(!file_exists($this->lfPath.'\\mdHashFile.txt')){
                $this->checkFile();
                $hf = fopen($this->lfPath."\\mdHashFile.txt",'w');
                $hashOut = implode("\n", $this->hashVals);
                fwrite($hf, $hashOut);
            }
            else{
                $hfVals = file($this->lfPath."\\mdHashFile.txt");
                if($hfVals[0] != $this->hashVals[0]){
                    $this->checkBotList();
                }
                else{
                    $this->robots = file($this->lfPath."\\justBots.txt");
                }
            }
        }
        public function __toString(){
            $json = "{\"robots\":[".json_encode($this->robots)."]}";
            return $json;
        }
    }
        
        
$robo = new getRobots();
$robo->setExclude(array("", "no", "yes","null"));
$robo->getBots();
var_dump($robo->robots);

?>

Oh - and I haven't updated the DocBlock yet... going to wait until it's accurate.

 

 

What do you guys think?  How wide of the mark am I?


Well, I'm not really going to read through all of it - as I'm not planning to use it. But, here is something that stuck out to me

 

 

        public function setExclude($mixed){
            if(is_array($mixed)){
                foreach($mixed as $key=>$toExclude){
                    $this->excludes[] = $toExclude;
                }
            }
            else{
                $this->excludes[] = $mixed;
            }
        }

 

First off, it doesn't take into consideration the possibility of duplicate values. That's easy enough to handle. But, it's also more complicated than it needs to be. No need to do a foreach() loop to add the values when you can simply use array_merge(). Also, you can typecast the input as an array so there would be no need for the if/else condition.

 

 

function setExclude($exclusions)
{
    //Cast the value as an array
    $exclusions = (array) $exclusions;
    //Append to current list of exclusions
    $this->excludes = array_merge($this->excludes, $exclusions);
    //Remove duplicates
    $this->excludes = array_unique($this->excludes);
    return $this->excludes;
}

OK guys, I think I have this nailed now.  I have changed the setExclude as you suggested, Psycho, and have tested it in all the ways I expect to use it.  I totally re-wrote the cached file checking because the first time round I made a total mess of it.  Anyway, I have updated the DocBlock as best as I can work it out (shouldn't be too far off), and what started as the idea of having a check to feed only SEO content to bots and to only render forms to actual users has, with a not insignificant amount of help, turned into this:

/**
 * Generates a list of robot useragent definitions for use with
 * $_SERVER['HTTP_USER_AGENT'] to identify robots
 *
 * A Huge Thank You to Psycho, Kicken and Thorpe @ forums.phpfreaks.com
 * for their help and advice.
 *
 * This links into the robotstxt.org site to access their current
 * robot list.  It then produces an array of these useragents that
 * can be used to check if a visitor is a robot or not.
 * Call: $yourVar = new getRobots();
 * Setter : $yourVar->setExclude(mixed $mixed) - send values to be excluded.
 *         Accepts either an array of values or a single string value
 * Getter : $yourVar->getBots(); builds the array in $yourVar->robots
 * JSON output (if you want to pass to javascript): echo $yourVar;
 *
 *
 * @param array $robots the array list of useragents
 * @param array $excludes array of exclusions from the bot list
 * @param string $url static url value for linking to the robotstxt.org data
 * @param string $lfPath path to generate subfolder to store cache files in
 * @param string $masterFile path to master cache file of robotstxt.org data
 * @param string $botFile path to cached bot file for quicker repeat array building
 * @param string $mdCheckFile path to md5 checksum cache to establish if cached bot file can be used
 * @param array $hashVals generated md5 values from current call
 * @param array $hashFileVals values from md5 checksum cache file used for comparison
 * @param string $output contents retrieved from robotstxt.org site
 * @return array getBots() returns array of robot user agents
 * @return string __toString() Returns JSON string of Object{"robots":array[{"numericalKey":"useragentText"}]}
 */
class getRobots{
    public $robots;
    public $excludes;
    private $url;
    private $lfPath;
    private $masterFile;
    private $botFile;
    private $mdCheckFile;
    private $hashVals;
    private $hashFileVals;
    private $output;

    public function __construct(){
        $this->url = "http://www.robotstxt.org/db/all.txt";
        $this->lfPath= substr(__FILE__,0,strripos(__FILE__,'\\')+1).'robots';
        $this->masterFile= $this->lfPath.'\\rbtList.txt';
        $this->botFile = $this->lfPath."\\allBots.txt";
        $this->mdCheckFile = $this->lfPath."\\mdHashFile.txt";
        $this->excludes[] = "Due to a deficiency in Java it's not currently possible to set the User-Agent.";
        $this->excludes[] = "Due to a deficiency in Java it's not currently possible";
        if(!is_dir($this->lfPath)){
            if(!mkdir($this->lfPath)){
                echo "error creating directory! PHP must have write permissions for this folder -- $lfPath";
                return false;
                exit;
            }
        }
    }
    public function setExclude($mixed){
        if(!is_array($mixed)){
            $mixed = (array)$mixed;
        }
        $this->excludes = array_merge($this->excludes, $mixed);
        $this->excludes = array_unique($this->excludes);
    }
    public function getBots(){
        $this->checkFile();
        $this->checkBotList();
    }
    private function checkFile(){
        if (file_exists($this->masterFile)){
            $mtime = filemtime($this->masterFile);
            $ctx = stream_context_create(array(
                'http' => array(
                    'header' => "If-modified-since: ".gmdate(DATE_RFC1123, $mtime)
                )
            ));
        }
        else {
            $ctx = stream_context_create();
        }
        $fp = fopen("http://www.robotstxt.org/db/all.txt", 'rb', false, $ctx);
        $this->output = stream_get_contents($fp);
        $this->checkBotList();
        $meta = stream_get_meta_data($fp);
        if (strpos($meta['wrapper_data'][0], ' 200 ') !== false){
            file_put_contents($this->masterFile, $this->output);
        }
        fclose($fp);            
    }    
    private function checkBotList(){
        $robots = array();
        $this->hashVals[0] = md5(implode("|",$this->excludes));
        if(!file_exists($this->mdCheckFile)){
            $fileVals = explode("\n",$this->output);                
        }
        else{
            $this->hashFileVals = file($this->mdCheckFile);
            if(trim($this->hashVals[0]) == trim($this->hashFileVals[0])){
                $this->robots = file($this->botFile);
                
            }
            else{
                $fileVals = file($this->masterFile);
            }
            
        }
        if(isset($fileVals)){
            foreach ($fileVals as $line=>$text){
                if (strpos($text, "robot-useragent:") !== FALSE){        
                    $robots[] = trim(substr($text,16));
                }
            }
            $filterRobs = array_diff($robots, $this->excludes);
            $filterRobs = array_unique($filterRobs);
            $this->robots = $filterRobs;
            $botOut = implode("\n", $filterRobs);
            $botHandle = fopen($this->botFile, 'w');
            fwrite($botHandle, $botOut);
            fclose($botHandle);
            $this->hashVals[1] = md5(implode("|", $filterRobs));
            $difCheck = array_diff($this->hashVals, (array)$this->hashFileVals);
            if(count($difCheck) >= 1){
                $writeback = implode("\n", $this->hashVals);
                $mdHandle = fopen($this->mdCheckFile, 'w');
                fwrite($mdHandle, $writeback);
            }
        }
    }
    public function __toString(){
        $json = "{\"robots\":[".json_encode($this->robots)."]}";
        return $json;
    }
}

If there is anything else let me know, and thanks again guys.


 

    public function __toString(){
        $json = "{\"robots\":[".json_encode($this->robots)."]}";
        return $json;
    }

 

Rather than do part of the json manually, and part with json_encode, just do the whole thing with json_encode:

public function __toString(){
    return json_encode(array('robots' => $this->robots));
}

 

        $fp = fopen("http://www.robotstxt.org/db/all.txt", 'rb', false, $ctx);
        $this->output = stream_get_contents($fp);
        $this->checkBotList();
        $meta = stream_get_meta_data($fp);
        if (strpos($meta['wrapper_data'][0], ' 200 ') !== false){

 

There isn't really a need for the call to checkBotList there. You call it from getBots() right after you call checkFile, so it'd just get done twice. Also, you still have the hard-coded url in the fopen call rather than using $this->url like I think you intended.

 

 

        if(!is_dir($this->lfPath)){
            if(!mkdir($this->lfPath)){
                echo "error creating directory! PHP must have write permissions for this folder -- $lfPath";
                return false;
                exit;
            }
        }

 

Constructor functions do not return a value, nor should they cause a script to suddenly exit. Throw an exception to indicate the error; that way the user can choose to ignore (or do something about) it if they wish.

        if(!is_dir($this->lfPath) && !mkdir($this->lfPath)){
            throw new RuntimeException("error creating directory! PHP must have write permissions for this folder -- $lfPath");
        }

 

    public function setExclude($mixed){
        if(!is_array($mixed)){
            $mixed = (array)$mixed;
        }

 

There is no need for the is_array check there. If $mixed is an array, then typecasting it to an array does nothing and $mixed is unchanged. If $mixed is a string, typecasting to an array is the same as doing $mixed = array($mixed).
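
A quick demonstration of that behaviour:

var_dump((array)'Googlebot');        // array(0 => 'Googlebot')
var_dump((array)array('Googlebot')); // unchanged: array(0 => 'Googlebot')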

 

 

    public function getBots(){

 

Having a method named get* generally implies that it returns something. I'd suggest either you have that method return a value, or rename it, perhaps to updateBots.
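
For example, one way to apply that to the posted method:

    public function getBots(){
        $this->checkFile();
        $this->checkBotList();
        return $this->robots; // now the get* name matches the behaviour
    }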

 

 

Lastly, in your docblock you might include an example or two of how exactly this class is supposed to be used.


Thanks again kicken, I have made the changes you suggested. Also, I have added a sort on the $this->excludes array, as it was failing the hash check if the same values were entered as the previous run but in a different order.

/**
 * Generates a list of robot useragent definitions for use with
 * $_SERVER['HTTP_USER_AGENT'] to identify robots
 *
 * A Huge Thank You to Psycho, Kicken and Thorpe @ forums.phpfreaks.com
 * for their help and advice.
 *
 * This links into the robotstxt.org site to access their current
 * robot list.  It then produces an array of these useragents that
 * can be used to check if a visitor is a robot or not.
 * Call: $yourVar = new getRobots();
 * Setter : $yourVar->setExclude(mixed $mixed) - send values to be excluded.
 *         Accepts either an array of values or a single string value
 * Builder : $yourVar->makeBots(); builds the array in $yourVar->robots
 * JSON output (if you want to pass to javascript): echo $yourVar;
 *
 * --------------------------------------------------------------
 * @example 1 : PHP BOT Check
 *
 * $bots = new getRobots;
 * $bots->setExclude(array("", "none", "no", "yes"));
 * $bots->makeBots();
 * $botArray = $bots->robots;
 *
 * if(!in_array($_SERVER['HTTP_USER_AGENT'], $botArray)){
 *        import_request_variables("g", "user_"); //example of something to do
 *         ...
 *         ...
 * }
 * else{
 *        echo "Bot Safe Site Visited"; //example of something to do
 *         ...
 *         ...
 * }
 * -------------------------------------------------------------
 * @example 2 : output to JSON
 *
 * $bots = new getRobots;
 * $bots->setExclude("");
 * $bots->setExclude("none");
 * $bots->setExclude("???");
 * $bots->setExclude("no");
 * $bots->setExclude("yes");
 * $bots->makeBots();
 *
 * header("Content-type: application/json");
 * echo $bots;
 * exit;
 * -----------------------------------------------------------
 *
 * @param array $robots the array list of useragents
 * @param array $excludes array of exclusions from the bot list
 * @param string $url static url value for linking to the robotstxt.org data
 * @param string $lfPath path to generate subfolder to store cache files in
 * @param string $masterFile path to master cache file of robotstxt.org data
 * @param string $botFile path to cached bot file for quicker repeat array building
 * @param string $mdCheckFile path to md5 checksum cache to establish if cached bot file can be used
 * @param array $hashVals generated md5 values from current call
 * @param array $hashFileVals values from md5 checksum cache file used for comparison
 * @param string $output contents retrieved from robotstxt.org site
 * @return array makeBots() builds the array of robot user agents
 * @return string __toString() Returns JSON string of {"robots":["useragentText", ...]}
 */
class getRobots{
    public $robots;
    public $excludes;
    private $url;
    private $lfPath;
    private $masterFile;
    private $botFile;
    private $mdCheckFile;
    private $hashVals;
    private $hashFileVals;
    private $output;

    public function __construct(){
        $this->url = "http://www.robotstxt.org/db/all.txt";
        $this->lfPath= substr(__FILE__,0,strripos(__FILE__,'\\')+1).'robots';
        $this->masterFile= $this->lfPath.'\\rbtList.txt';
        $this->botFile = $this->lfPath."\\allBots.txt";
        $this->mdCheckFile = $this->lfPath."\\mdHashFile.txt";
        $this->excludes[] = "Due to a deficiency in Java it's not currently possible to set the User-Agent.";
        $this->excludes[] = "Due to a deficiency in Java it's not currently possible";
        if(!is_dir($this->lfPath)){
            if(!mkdir($this->lfPath)){
                throw new RuntimeException("error creating directory! PHP must have write permissions for this folder -- $lfPath");
            }
        }
    }
    public function setExclude($mixed){
        $mixed = (array)$mixed;
        $this->excludes = array_merge($this->excludes, $mixed);
        $this->excludes = array_unique($this->excludes);
        sort($this->excludes);
    }
    public function makeBots(){
        $this->checkFile();
        $this->checkBotList();
    }
    private function checkFile(){
        if (file_exists($this->masterFile)){
            $mtime = filemtime($this->masterFile);
            $ctx = stream_context_create(array(
                'http' => array(
                    'header' => "If-modified-since: ".gmdate(DATE_RFC1123, $mtime)
                )
            ));
        }
        else {
            $ctx = stream_context_create();
        }
        $fp = fopen($this->url, 'rb', false, $ctx);
        $this->output = stream_get_contents($fp);
        $meta = stream_get_meta_data($fp);
        if (strpos($meta['wrapper_data'][0], ' 200 ') !== false){
            file_put_contents($this->masterFile, $this->output);
        }
        fclose($fp);            
    }    
    private function checkBotList(){
        $robots = array();
        $this->hashVals[0] = md5(implode("|",$this->excludes));
        if(!file_exists($this->mdCheckFile)){
            $fileVals = explode("\n",$this->output);                
        }
        else{
            $this->hashFileVals = file($this->mdCheckFile);
            if(trim($this->hashVals[0]) == trim($this->hashFileVals[0])){
                $this->robots = file($this->botFile);
                
            }
            else{
                $fileVals = file($this->masterFile);
            }
            
        }
        if(isset($fileVals)){
            foreach ($fileVals as $line=>$text){
                if (strpos($text, "robot-useragent:") !== FALSE){        
                    $robots[] = trim(substr($text,16));
                }
            }
            $filterRobs = array_diff($robots, $this->excludes);
            $filterRobs = array_unique($filterRobs);
            $this->robots = $filterRobs;
            $botOut = implode("\n", $filterRobs);
            $botHandle = fopen($this->botFile, 'w');
            fwrite($botHandle, $botOut);
            fclose($botHandle);
            $this->hashVals[1] = md5(implode("|", $filterRobs));
            $difCheck = array_diff($this->hashVals, (array)$this->hashFileVals);
            if(count($difCheck) >= 1){
                $writeback = implode("\n", $this->hashVals);
                $mdHandle = fopen($this->mdCheckFile, 'w');
                fwrite($mdHandle, $writeback);
            }
        }
    }
    public function __toString(){
        return json_encode(array('robots' => $this->robots));
    }
}
