linuxdream Posted January 29, 2007 Share Posted January 29, 2007

Hey all,

I've been working on a class called ripCURL that is built upon the cURL libraries for PHP. Basically, the class makes it easy to automatically navigate through web sites and store/parse the content. I built it on my own time (so that my evil corp. can't steal the rights to it) to be used at work. However, it seems very useful for integrating multiple sites together and automatically retrieving content for live updates, DB storage, etc. Please have a look and let me know of any bugs/funny things that occur. The code is located at http://ripcurl.sdsustudent.com or at SourceForge.net under ripcurl. The tutorial and documentation are at the first link in PDF or .doc format.

The basic structure is like so:

[code]
<?php
require_once('ripcurl.class.php');
$rip = new ripcurl();
$rip->ripRun('http://www.somesite.com', 1);
echo $rip->getRawHtml();
?>
[/code]

Thanks,
Brandon C.

[code]
<?php
//RipCURL class 0.5.4 - SD Linux Services
//This class was written by Brandon Ching of SD Linux Services
//It is licensed under the GPL 2 and requires PHP5 and CURL 7
class ripCurl{

    //Class definitions

    //Define the default directory for ripWrite. Must include trailing "/". Please turn safe_mode off or make the proper UID/GID changes
    //to the php.ini file or in the ripWrite function below. The directory should be located under the web root directory for links to work
    //or use a directory alias in httpd.conf.
    const WRITEDIRECTORY = "/tmp/";

    //Set class variables

    //curl handler
    private $ch;

    //Returned value of curl_exec. This value should never be directly written to. It is meant to be used by class methods only.
    //It always holds the current HTML of the last page to be fetched/processed.
    private $rawHtml;

    //Value of the last POST fields
    private $postData;

    //Error variable
    private $error;

    //Total HTTP errors for class instance (failed transfers and responses with status >= 400)
    private $httpErrors = 0;

    //Count for ripWrite
    private $writeCount = 0;

    //Count for ripGetlinks
    private $linksCount = 0;

    //Total redirects for class instance
    private $totalRedirects = 0;

    //Total time for class existence
    private $totalTime = 0;

    //Total size of information transferred
    private $totalSize = 0;

    //Average speed of all transfers
    private $averageSpeed = 0;

    //Average time per transfer
    private $averageTime = 0;

    //Total number of connections for class existence
    private $totalConnections = 0;

    //Last ripRun status code (read through getLastStatus)
    private $lastStatus = null;

    //Effective URL of the last ripRun (read through getLastUrl)
    private $lastUrl = null;

    //True while POST mode was switched on by the $postdata argument of ripRun
    private $postViaRun = false;

    //Initialize cookiejar directory. Filename gets set in the constructor
    private $cookiejar = "/tmp/";

    //Accessor methods
    public function getRawHtml(){ return $this->rawHtml; }
    public function getPostData(){ return $this->postData; }
    public function getHttpErrors(){ return $this->httpErrors; }
    public function getWriteCount(){ return $this->writeCount; }
    public function getLinksCount(){ return $this->linksCount; }
    public function getTotalRedirects(){ return $this->totalRedirects; }
    public function getTotalTime(){ return $this->totalTime; }
    public function getTotalSize(){ return $this->totalSize; }
    public function getAverageSpeed(){ return $this->averageSpeed; }
    public function getAverageTime(){ return $this->averageTime; }
    public function getTotalConnections(){ return $this->totalConnections; }
    public function getCookieJar(){ return $this->cookiejar; }
    public function getError(){ return $this->error; }
    public function getLastStatus(){ return $this->lastStatus; }
    public function getLastUrl(){ return $this->lastUrl; }

    //Resets the link counter maintained by getLinks().
    //
    //Returns TRUE
    //
    public function zeroLinksCount(){
        //The original tested "if($this->linksCount = 0)", an assignment that is always falsy
        $this->linksCount = 0;
        return true;
    }

    //Points the curl handle at a different cookie file. $o must be an existing writable file,
    //or a not-yet-existing file inside a writable directory.
    //
    //Returns TRUE when the cookie jar was changed, FALSE (and sets the error) otherwise
    //
    public function setCookieJar($o){
        if(!is_string($o) || $o === '' || !$this->isWritableFilePath($o)){
            $this->error = "setCookieJar: Cookiejar could not be set";
            return false;
        }
        $this->cookiejar = $o;
        $this->cookieJar($o);
        $this->cookieFile($o);
        return true;
    }

    //Constructor method. Designed to set standard defaults for pulling sites into a variable.
    //Declared as __construct because a method named after the class is not a constructor on PHP 8.
    public function __construct(){
        //set_time_limit(10000);
        //uniqid() with extra entropy gives every instance its own cookie file
        $this->cookiejar = $this->cookiejar . md5(uniqid((string) mt_rand(), true)) . ".cookiejar.txt";
        $this->ch = curl_init();
        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($this->ch, CURLOPT_AUTOREFERER, 1);
        curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, 1);
        //NOTE(review): certificate verification is switched off by default, as in the original.
        //Call sslVerifyPeer(1) and sslVerifyHost(2) before talking to hosts that must be authenticated.
        curl_setopt($this->ch, CURLOPT_SSL_VERIFYHOST, 0);
        curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, 0);
        curl_setopt($this->ch, CURLOPT_CONNECTTIMEOUT, 30);
        curl_setopt($this->ch, CURLOPT_COOKIEJAR, $this->cookiejar);
        curl_setopt($this->ch, CURLOPT_COOKIEFILE, $this->cookiejar);
    }

    //Class methods for setting curlopt variables. Each one forwards $o to curl_setopt()
    //for the option named in its body.
    public function autoReferer($o = 1){ curl_setopt($this->ch, CURLOPT_AUTOREFERER, $o); }
    public function cookieSession($o = 1){ curl_setopt($this->ch, CURLOPT_COOKIESESSION, $o); }
    public function failOnError($o = 1){ curl_setopt($this->ch, CURLOPT_FAILONERROR, $o); }
    public function followLocation($o = 1){ curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, $o); }
    public function forbidReuse($o = 1){ curl_setopt($this->ch, CURLOPT_FORBID_REUSE, $o); }
    public function freshConnect($o = 1){ curl_setopt($this->ch, CURLOPT_FRESH_CONNECT, $o); }
    public function showHeader($o = 1){ curl_setopt($this->ch, CURLOPT_HEADER, $o); }
    public function httpGet($o = 1){ curl_setopt($this->ch, CURLOPT_HTTPGET, $o); }
    public function mute($o = 1){
        //CURLOPT_MUTE is not defined on every PHP/libcurl build; naming a missing constant is fatal on PHP 8,
        //so the option is only applied where it exists.
        if(defined('CURLOPT_MUTE')){
            curl_setopt($this->ch, constant('CURLOPT_MUTE'), $o);
        }
    }
    public function noSignal($o = 1){ curl_setopt($this->ch, CURLOPT_NOSIGNAL, $o); }
    public function post($o = 1){ curl_setopt($this->ch, CURLOPT_POST, $o); }
    public function put($o = 1){ curl_setopt($this->ch, CURLOPT_PUT, $o); }
    public function returnTransfer($o = 1){ curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, $o); }
    public function sslVerifyPeer($o = 1){ curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, $o); }
    public function sslVerifyHost($o = 1){ curl_setopt($this->ch, CURLOPT_SSL_VERIFYHOST, $o); }
    public function verbose($o = 1){ curl_setopt($this->ch, CURLOPT_VERBOSE, $o); }
    public function bufferSize($o){ curl_setopt($this->ch, CURLOPT_BUFFERSIZE, $o); }
    public function connectTimeout($o){ curl_setopt($this->ch, CURLOPT_CONNECTTIMEOUT, $o); }
    public function dnsCacheTimeout($o){ curl_setopt($this->ch, CURLOPT_DNS_CACHE_TIMEOUT, $o); }
    public function httpVersion($o){ curl_setopt($this->ch, CURLOPT_HTTP_VERSION, $o); }
    public function httpAuth($o){ curl_setopt($this->ch, CURLOPT_HTTPAUTH, $o); }
    public function maxConnects($o){ curl_setopt($this->ch, CURLOPT_MAXCONNECTS, $o); }
    public function port($o){ curl_setopt($this->ch, CURLOPT_PORT, $o); }
    public function maxRedirs($o){ curl_setopt($this->ch, CURLOPT_MAXREDIRS, $o); }
    public function sslVersion($o){ curl_setopt($this->ch, CURLOPT_SSLVERSION, $o); }
    public function timeOut($o){ curl_setopt($this->ch, CURLOPT_TIMEOUT, $o); }
    public function cookie($o){ curl_setopt($this->ch, CURLOPT_COOKIE, $o); }
    public function cookieJar($o){ curl_setopt($this->ch, CURLOPT_COOKIEJAR, $o); }
    public function cookieFile($o){ curl_setopt($this->ch, CURLOPT_COOKIEFILE, $o); }
    public function encoding($o){ curl_setopt($this->ch, CURLOPT_ENCODING, $o); }
    public function postFields($o){ curl_setopt($this->ch, CURLOPT_POSTFIELDS, $o); }
    public function referer($o){ curl_setopt($this->ch, CURLOPT_REFERER, $o); }
    public function url($o){ curl_setopt($this->ch, CURLOPT_URL, $o); }

    //Sets the User-Agent header. $o is either one of the shortcuts below or a literal
    //user agent string, which is passed through unchanged.
    public function userAgent($o){
        $shortcuts = array(
            //MSIE 6 identifies itself as Mozilla/4.0 (the original string said 5.0)
            'ie6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
            'ie7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
            'ff' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; es-ES; rv:1.8.0.6) Gecko/20060728 Firefox/1.5.0.6',
            'google' => 'Googlebot/2.1 (+http://www.google.com/bot.html)',
            'msn' => 'msnbot/1.0 (+http://search.msn.com/msnbot.htm)',
            'yahoo' => 'Mozilla/5.0 (compatible; Yahoo! Slurp;http://help.yahoo.com/help/us/ysearch/slurp)'
        );
        if(is_string($o) && isset($shortcuts[$o])){
            $o = $shortcuts[$o];
        }
        curl_setopt($this->ch, CURLOPT_USERAGENT, $o);
    }

    //Username and password in uname:pass format
    public function userPwd($o){ curl_setopt($this->ch, CURLOPT_USERPWD, $o); }
    public function httpHeader($o){ curl_setopt($this->ch, CURLOPT_HTTPHEADER, $o); }

    /////////////////////////////////////////////////////////////////////////////////////
    // Private helpers                                                                 //
    /////////////////////////////////////////////////////////////////////////////////////

    //TRUE when $path is a writable regular file, or does not exist yet but lives in a writable directory.
    private function isWritableFilePath($path){
        if(file_exists($path)){
            return is_file($path) && is_writable($path);
        }
        return is_writable(dirname($path));
    }

    //Creates $directory (and missing parents) when needed. Returns TRUE when it exists afterwards.
    private function ensureDirectory($directory){
        if(is_dir($directory)){
            return true;
        }
        //is_dir() is re-checked so a directory created concurrently still counts as success
        return @mkdir($directory, 0777, true) || is_dir($directory);
    }

    //Runs preg_replace() on $html and stores the result in $rawHtml.
    //Returns the new HTML, or FALSE (leaving $rawHtml untouched) when the pattern fails.
    private function replaceAndStore($pattern, $replacement, $html, $caller){
        $result = preg_replace($pattern, $replacement, (string) $html);
        if(is_null($result)){
            $this->error = "$caller: Regular expression replace failed";
            return false;
        }
        $this->rawHtml = $result;
        return $result;
    }

    //Map of tag name => attribute that may carry a URL. Shared by fixLinks() and getAllLinks().
    private function linkAttributes(){
        return array(
            'table'=>'background', 'td'=>'background', 'tr'=>'background', 'th'=>'background',
            'body'=>'background', 'a'=>'href', 'link'=>'href', 'area'=>'href', 'form'=>'action',
            'script'=>'src', 'img'=>'src', 'iframe'=>'src', 'frame'=>'src', 'embed'=>'src'
        );
    }

    //Reads one attribute from the text of a single tag. Handles double-quoted, single-quoted
    //and unquoted values. Returns the raw value, or NULL when the attribute is absent.
    private function attributeValue($tag, $attribute){
        //(?| ... ) is a branch-reset group: whichever quoting style matches, the value is group 1
        $pattern = '/\s' . preg_quote($attribute, '/') . '\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]*))/is';
        if(preg_match($pattern, (string) $tag, $found)){
            return $found[1];
        }
        return null;
    }

    //Returns the HTML of the first <form> whose name attribute equals $formName, or NULL when there is none.
    private function extractForm($formName, $html){
        //[^>]* keeps the name test inside one opening tag; the lookahead stops "a" from matching name="ab"
        $pattern = '|<form\b[^>]*?\sname\s*=\s*["\']?' . preg_quote($formName, '|')
                 . '["\']?(?=[\s>/])[^>]*>.*?</form>|is';
        if(preg_match($pattern, (string) $html, $found)){
            return $found[0];
        }
        return null;
    }

    //Splits an absolute URL into the pieces fixLinks() needs.
    //Returns array(scheme, host, base) where host is scheme://host[:port] and base is the
    //directory URL with a trailing slash, or FALSE when $url is not absolute.
    private function baseParts($url){
        $parts = parse_url((string) $url);
        if($parts === false || !isset($parts['scheme']) || !isset($parts['host'])){
            return false;
        }
        $host = $parts['scheme'] . "://" . $parts['host'];
        if(isset($parts['port'])){
            $host .= ':' . $parts['port'];
        }
        $path = isset($parts['path']) ? $parts['path'] : '/';
        if(substr($path, -1) != '/'){
            $slash = strrpos($path, '/');
            $last = ($slash === false) ? $path : substr($path, $slash + 1);
            if($slash !== false && strpos($last, '.') !== false){
                //Last segment looks like a file (page.html): relative links resolve against its directory
                $path = substr($path, 0, $slash + 1);
            }else{
                //No extension: treated as a directory, which is how the original behaved
                $path .= '/';
            }
        }
        return array('scheme' => $parts['scheme'], 'host' => $host, 'base' => $host . $path);
    }

    //Converts an ISO-8859-1 string to UTF-8. Strings that already are valid UTF-8 are returned unchanged.
    private function toUtf8($value){
        $value = (string) $value;
        if($value === '' || preg_match('//u', $value) === 1){
            return $value;
        }
        if(function_exists('mb_convert_encoding')){
            return mb_convert_encoding($value, 'UTF-8', 'ISO-8859-1');
        }
        return utf8_encode($value);
    }

    /////////////////////////////////////////////////////////////////////////////////////
    // Begin major class methods                                                       //
    /////////////////////////////////////////////////////////////////////////////////////

    //Used to display a summary of the entire ripCurl session.
    //
    //If a parameter is specified, it will return only that key value (NULL for an unknown key).
    //Otherwise, return an array
    //
    public function getInfo($o = null){
        $info = curl_getinfo($this->ch);
        if(is_null($o)){
            return $info;
        }
        return isset($info[$o]) ? $info[$o] : null;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Get contents of cookiefile. Every line containing a tab is split into seven columns.
    //
    //NOTE: the key names are kept for backward compatibility. In the Netscape cookie file
    //format the second column is the include-subdomains flag and the fourth the secure flag,
    //so 'secure' and 'httpOnly' below do not hold what their names suggest.
    //
    //Returns an array of associative arrays (one per cookie), or FALSE when the file is unreadable
    //
    public function getCookieJarContents($cookiefile = null){
        if(is_null($cookiefile)){
            $cookiefile = $this->cookiejar;
        }
        if(!is_readable($cookiefile)){
            $this->error = "ripGetCookieContents: Cookie file is not readable";
            return false;
        }
        $contents = file_get_contents($cookiefile);
        if($contents === false){
            $this->error = "ripGetCookieContents: Cookie file is not readable";
            return false;
        }
        $cookies = array();
        foreach(preg_split('/\r\n|\r|\n/', $contents) as $line){
            if(strpos($line, "\t") === false){
                continue;
            }
            //Pad short lines so a malformed entry cannot raise undefined-offset notices
            $c = array_pad(explode("\t", $line), 7, '');
            $cookies[] = array(
                'host' => $c[0],
                'secure' => $c[1],
                'path' => $c[2],
                'httpOnly' => $c[3],
                'expire' => $c[4],
                'name' => $c[5],
                'value' => $c[6]
            );
        }
        return $cookies;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Sets a cookie according to Netscape cookie files. $cookie is an array of the seven
    //columns in file order; they are joined with tabs and appended as one line.
    //
    //Returns TRUE on successful cookie write
    //
    public function writeCookie($cookie){
        if(!is_array($cookie) || count($cookie) != 7){
            $this->error = "setCookie: Argument must be an array with host, secure, path, httpOnly, expire, name, and value";
            return false;
        }
        $cookiefile = $this->cookiejar;
        //The jar may not exist yet, so a missing file in a writable directory is acceptable
        if(!$this->isWritableFilePath($cookiefile)){
            $this->error = "Cookie file is not writeable";
            return false;
        }
        $fileh = fopen($cookiefile, 'a');
        if($fileh === false){
            $this->error = "setCookie: Could not write cookie to file";
            return false;
        }
        //One cookie per line: without the newline consecutive cookies would run together
        $written = fwrite($fileh, implode("\t", $cookie) . "\n");
        fclose($fileh);
        if($written === false){
            $this->error = "setCookie: Could not write cookie to file";
            return false;
        }
        return true;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Pulls all HTML between the $start and $end and returns it. $start and $end are inserted
    //into a "|"-delimited regular expression unescaped, so they are regex fragments and must
    //not contain an unescaped "|". Pass any non-null $greedy for a greedy match.
    //
    //Returns array of captured content
    //
    public function ripInBetween($start, $end, $greedy = null, $html = null){
        if(is_null($html)){
            $html = $this->getRawHtml();
        }
        $capture = is_null($greedy) ? "(.*?)" : "(.*)";
        $needle = "|$start" . $capture . "$end|is";
        preg_match_all($needle, (string) $html, $result, PREG_PATTERN_ORDER);
        return isset($result[1]) ? $result[1] : array();
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Search and replace function. $search is a regex fragment (see ripInBetween).
    //
    //Returns value of $html after replacement. Also sets replaced value to $rawHtml.
    //Returns FALSE and leaves $rawHtml alone when the pattern is invalid.
    //
    public function sandr($search, $replace, $html = null){
        if(is_null($html)){
            $html = $this->getRawHtml();
        }
        return $this->replaceAndStore("|$search|is", $replace, $html, 'sandr');
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Removes all javascript from the passed html
    //
    //Returns clean HTML. Also sets value to $rawHtml
    //
    public function ripJS($html = null){
        if(is_null($html)){
            $html = $this->getRawHtml();
        }
        return $this->replaceAndStore('|<script.*?/script>|is', '', $html, 'ripJS');
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Removes all <style> blocks and their contents.
    //
    //Returns clean HTML. Also sets value to $rawHtml
    //
    public function ripStyles($html = null){
        if(is_null($html)){
            $html = $this->getRawHtml();
        }
        return $this->replaceAndStore('|<style.*?/style>|is', '', $html, 'ripStyles');
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Method to get all links in a page. If the $id is specified, returned links must
    //contain $id ($id is a regex fragment). Link counts are tracked in $this->linksCount:
    //with an $id the counter is increased per hit (reset it with zeroLinksCount()),
    //without one it is set to the number of links found.
    //
    //Returns array of URLs when $id is given, otherwise the raw match arrays
    //(0 => whole tag, 1 => URL, 2 => link text)
    //
    public function getLinks($id = null, $html = null){
        if(is_null($html)){
            $html = $this->getRawHtml();
        }
        //href may appear anywhere in the tag; quoted values may contain spaces
        $pattern = '/<a\b[^>]*?\shref\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]*))[^>]*>(.*?)<\/a>/is';
        if(!preg_match_all($pattern, (string) $html, $matches, PREG_SET_ORDER)){
            return array();
        }
        if(is_null($id)){
            $this->linksCount = count($matches);
            return $matches;
        }
        $links = array();
        $idpattern = '|' . $id . '|';
        foreach($matches as $match){
            if(preg_match($idpattern, $match[1])){
                $links[] = $match[1];
                $this->linksCount++;
            }
        }
        return $links;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Search for string within $html. Set $caseSensitive to true for case sensitive search.
    //$needle is a regex fragment (see ripInBetween).
    //
    //Returns true if $needle found within
    //
    public function inData($needle, $caseSensitive = null, $html = null){
        if(is_null($html)){
            $html = $this->getRawHtml();
        }
        $flags = is_null($caseSensitive) ? 'is' : 's';
        return preg_match("|$needle|" . $flags, (string) $html) ? true : false;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Deletes specified directory (relative to WRITEDIRECTORY) and its contents.
    //
    //Returns TRUE if directory and contents were successfully deleted
    //
    public function clearDir($dir){
        //remove leading / if it's there
        $dir = ltrim((string) $dir, '/');
        $target = self::WRITEDIRECTORY . $dir;

        //An empty or "." argument would otherwise wipe the whole write directory
        $root = realpath(self::WRITEDIRECTORY);
        if($dir === '' || ($root !== false && realpath($target) === $root)){
            $this->error = "clearDir: Refusing to delete the write directory itself";
            return false;
        }

        $handle = @opendir($target);
        if(!$handle){
            $this->error = "clearDir: Could not open $target for delete";
            return false;
        }
        while(false !== ($item = readdir($handle))){
            if($item == "." || $item == ".."){
                continue;
            }
            $path = "$target/$item";
            //Symlinks are unlinked, never followed, so nothing outside the tree is touched
            if(is_dir($path) && !is_link($path)){
                $this->clearDir("$dir/$item");
            }else{
                @unlink($path);
            }
        }
        closedir($handle);

        if(rmdir($target)){
            return true;
        }
        $this->error = "clearDir: There was a problem clearing $target.";
        return false;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Writes the value of $html to $dir (relative to WRITEDIRECTORY). If $html is not
    //specified, it uses $this->rawHtml. If the last path segment contains a "." and is not
    //an existing directory it is taken as the file name; otherwise $dir is a directory and
    //the file is named md5(content).html inside it. Missing directories are created.
    //
    //Returns TRUE if write was successful
    //
    public function write($dir, $html = null){
        //If no data was passed, assign the class variable
        if(is_null($html)){
            $html = $this->rawHtml;
        }
        $html = (string) $html;

        //remove leading / if it's there
        $dir = self::WRITEDIRECTORY . ltrim((string) $dir, '/');
        $path_info = pathinfo($dir);
        $fileGiven = strpos($path_info['basename'], '.') !== false && !is_dir($dir);

        if($fileGiven){
            $directory = $path_info['dirname'];
            $abspath = $dir;
        }else{
            $directory = rtrim($dir, '/') . '/';
            //Hashing the content gives identical pages the same name, so nothing is duplicated
            $abspath = $directory . md5($html) . ".html";
        }

        //mkdir() replaces the original exec('mkdir -p ...'), whose failure check could never trigger
        if(!$this->ensureDirectory($directory)){
            $this->error = "write: Creation of $directory was unsuccessfull";
            return false;
        }

        $file = fopen($abspath, 'w');
        if($file === false){
            $this->error = "write: There was an error writing to $abspath.";
            return false;
        }
        $written = fwrite($file, $html);
        fclose($file);
        //fwrite() returns 0 for empty content, which is not a failure
        if($written === false){
            $this->error = "write: There was an error writing to $abspath.";
            return false;
        }
        $this->writeCount++;
        return true;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Writes $image to $img_name. $img_name will be appended to WRITEDIRECTORY constant.
    //$image can be actual binary image or image URL (http:, https: or ftp:). A URL is
    //fetched with ripRun(), which also replaces $rawHtml.
    //
    //Returns true if image wrote to disk successfully
    //
    public function writeImage($image, $img_name){
        if(empty($image)){
            $this->error = "writeImage: There is no image to write.";
            return false;
        }

        //remove leading / if it's there
        $img_name = self::WRITEDIRECTORY . ltrim((string) $img_name, '/');
        $path_info = pathinfo($img_name);

        //file was specified, create path upto filename
        if(!$this->ensureDirectory($path_info['dirname'])){
            $this->error = "writeImage: Creation of $img_name was unsuccessfull";
            return false;
        }

        //Allows for either actual binary image file or web address. The original pattern was a
        //character class and matched any data starting with one of the letters h, t, p, s, f.
        if(preg_match('/^(?:https?|ftp):/i', $image)){
            $image = $this->ripRun($image);
            if(!is_string($image) || $image === ''){
                $this->error = "writeImage: Could not download image";
                return false;
            }
        }

        $file = fopen($img_name, "wb");
        if($file === false){
            $this->error = "writeImage: Could not write image";
            return false;
        }
        $written = fwrite($file, $image);
        fclose($file);
        if($written === false){
            $this->error = "writeImage: Could not write image";
            return false;
        }
        return true;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Removes any alert or confirm javascript popups.
    //
    //Returns clean HTML. Also sets $rawHtml to clean value.
    //
    public function ripJSPopups($html = null){
        if(is_null($html)){
            $html = $this->rawHtml;
        }
        $clean = $this->replaceAndStore('/alert\(.*?\)[;]/', '', $html, 'ripJSPopups');
        if($clean === false){
            return false;
        }
        return $this->replaceAndStore('/confirm\(.*?\)[;]/', '', $clean, 'ripJSPopups');
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Makes all relative links absolute with the provided $baseUrl. This function is usually
    //only used by ripRun() when specified in argument 2. An absolute <base href> inside the
    //document takes precedence over $baseUrl. URLs that already have a scheme, "#fragment"
    //links and elements without the attribute are left untouched.
    //
    //Returns clean HTML. Also sets $rawHtml to cleaned value. Returns FALSE when no absolute
    //base URL is available or the HTML cannot be parsed.
    //
    public function fixLinks($baseUrl, $html = null){
        if(is_null($html)){
            $html = $this->rawHtml;
        }
        $html = (string) $html;
        if(trim($html) === ''){
            //Nothing to rewrite; DOMDocument rejects empty input
            return $html;
        }

        $base = $this->baseParts($baseUrl);
        //[^>]* keeps the search inside the <base> tag itself
        if(preg_match('/<base\b[^>]*?\shref\s*=\s*["\']?([^\'"\s>]+)/i', $html, $found)){
            $fromDocument = $this->baseParts($found[1]);
            if($fromDocument !== false){
                $base = $fromDocument;
            }
        }
        if($base === false){
            $this->error = "fixLinks: Base URL must be absolute";
            return false;
        }

        $doc = new DOMDocument();
        //Real-world HTML is rarely valid; collect parser warnings instead of printing them
        $previous = libxml_use_internal_errors(true);
        $loaded = $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        if(!$loaded){
            $this->error = "fixLinks: HTML could not be parsed";
            return false;
        }

        foreach($this->linkAttributes() as $tag => $attribute){
            foreach($doc->getElementsByTagName($tag) as $link){
                $url = $link->getAttribute($attribute);
                if($url === '' || substr($url, 0, 1) == '#'){
                    //Absent attribute or in-page anchor: nothing to resolve
                    continue;
                }
                if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url)){
                    //Already absolute (http:, https:, ftp:, javascript:, mailto:, ...)
                    continue;
                }
                if(substr($url, 0, 2) == '//'){
                    //Protocol-relative: only the scheme is missing
                    $fixed = $base['scheme'] . ':' . $url;
                }elseif(substr($url, 0, 1) == '/'){
                    if(substr($url, 1, 2) == '..'){
                        //Leave as if relative since /../ needs to maintain its current path
                        $fixed = rtrim($base['base'], '/') . $url;
                    }else{
                        $fixed = $base['host'] . $url;
                    }
                }else{
                    $fixed = $base['base'] . $url;
                }
                $link->setAttribute($attribute, $fixed);
            }
        }

        $html = $doc->saveHTML();
        $this->rawHtml = $html;
        return $html;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Find all links to other files...inside body, img, iframe, etc...
    //Single, double, and no quotes are supported.
    //
    //Returns array of all possible external links found, grouped by tag in the order of
    //the tag table
    //
    public function getAllLinks($html = null){
        if(is_null($html)){
            $html = $this->rawHtml;
        }
        $html = (string) $html;
        $links = array();
        foreach($this->linkAttributes() as $tag => $attribute){
            //\b stops "a" from also matching <area> or <abbr>
            $pattern = '/<' . $tag . '\b[^>]*?\s' . $attribute
                     . '\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]*))/is';
            if(preg_match_all($pattern, $html, $matches)){
                $links = array_merge($links, $matches[1]);
            }
        }
        return $links;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Finds all hidden form elements within specified $formName. If empty, it searches
    //the entire document. Inputs without a name are skipped.
    //
    //Returns associative array (element name=>element value) of found elements and their
    //current values. Empty array when the named form does not exist.
    //
    public function getHiddenFormElements($formName = null, $html = null){
        if(is_null($html)){
            $html = $this->rawHtml;
        }
        if(!is_null($formName)){
            $html = $this->extractForm($formName, $html);
            if(is_null($html)){
                return array();
            }
        }
        $hiddenArray = array();
        //Each <input> is matched on its own so attributes of neighbouring inputs cannot be mixed up
        preg_match_all('|<input\b[^>]*>|is', (string) $html, $matches);
        foreach($matches[0] as $input){
            $type = $this->attributeValue($input, 'type');
            if(is_null($type) || strtolower($type) != 'hidden'){
                continue;
            }
            $name = $this->attributeValue($input, 'name');
            if(is_null($name) || $name === ''){
                continue;
            }
            $value = $this->attributeValue($input, 'value');
            $hiddenArray[$name] = is_null($value) ? '' : $value;
        }
        return $hiddenArray;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Finds all form elements within specified $formName. If empty, it searches the entire
    //document. Radio buttons and checkboxes are included only when checked; button, submit,
    //reset and image inputs are never included. Values are assigned in the order select,
    //textarea, input, so an input overrides a select or textarea of the same name.
    //
    //Returns associative array (element name=>element value) of found elements and their
    //current values. Empty array when the named form does not exist.
    //
    public function getFormElements($formName = null, $html = null){
        if(is_null($html)){
            $html = $this->rawHtml;
        }
        if(!is_null($formName)){
            $html = $this->extractForm($formName, $html);
            if(is_null($html)){
                return array();
            }
        }
        $html = (string) $html;
        $elementsArray = array();

        //Deal with select statements: the selected option supplies the value
        preg_match_all('|(<select\b[^>]*>)(.*?)</select>|is', $html, $selects, PREG_SET_ORDER);
        foreach($selects as $select){
            $name = $this->attributeValue($select[1], 'name');
            if(is_null($name) || $name === ''){
                continue;
            }
            preg_match_all('|(<option\b[^>]*>)([^<]*)|is', $select[2], $options, PREG_SET_ORDER);
            foreach($options as $option){
                if(!preg_match('|\sselected|i', $option[1])){
                    continue;
                }
                $value = $this->attributeValue($option[1], 'value');
                //An option without a value attribute submits its text
                $elementsArray[$name] = is_null($value) ? trim($option[2]) : $value;
            }
        }

        //Then textarea: the value is everything between the tags
        preg_match_all('|(<textarea\b[^>]*>)(.*?)</textarea>|is', $html, $textareas, PREG_SET_ORDER);
        foreach($textareas as $textarea){
            $name = $this->attributeValue($textarea[1], 'name');
            if(is_null($name) || $name === ''){
                continue;
            }
            $elementsArray[$name] = $textarea[2];
        }

        //Finally input elements
        $neverSubmitted = array('button', 'submit', 'reset', 'image');
        preg_match_all('|<input\b[^>]*>|is', $html, $inputs);
        foreach($inputs[0] as $input){
            $type = $this->attributeValue($input, 'type');
            //An input without a type attribute is a text field
            $type = is_null($type) ? 'text' : strtolower($type);
            if(in_array($type, $neverSubmitted, true)){
                continue;
            }
            $toggle = ($type == 'radio' || $type == 'checkbox');
            if($toggle && !preg_match('|\schecked|i', $input)){
                continue;
            }
            $name = $this->attributeValue($input, 'name');
            if(is_null($name) || $name === ''){
                continue;
            }
            $value = $this->attributeValue($input, 'value');
            if(is_null($value)){
                //A checked box or radio without a value attribute submits "on"
                $value = $toggle ? 'on' : '';
            }
            $elementsArray[$name] = $value;
        }
        return $elementsArray;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Gets form information. When $formName is given and a form of that name exists, its
    //opening tag is inspected; otherwise the first <form> tag of $html is used.
    //
    //Returns array with keys 'action' and 'method' (NULL when the attribute is absent)
    //
    public function getFormInfo($formName, $html = null){
        if(is_null($html)){
            $html = $this->rawHtml;
        }
        $scope = (string) $html;
        if(!is_null($formName) && $formName !== ''){
            $form = $this->extractForm($formName, $scope);
            if(!is_null($form)){
                $scope = $form;
            }
        }
        //Only the opening tag can carry the form's own action and method
        if(preg_match('|<form\b[^>]*>|is', $scope, $tag)){
            $scope = $tag[0];
        }
        return array(
            'action' => $this->attributeValue($scope, 'action'),
            'method' => $this->attributeValue($scope, 'method')
        );
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Some general statistics for the current ripCurl session. This is a cumulative total
    //of all ripRun()'s.
    //
    //Returns HTML of totals.
    //
    public function getStats(){
        $stats = "<br /><br /> Total Connections: $this->totalConnections<br /> Total Time: $this->totalTime<br /> Total Download Size: $this->totalSize<br /> Average Connection Speed: $this->averageSpeed<br /> Total Redirects: $this->totalRedirects<br /> Average Time per Connection: $this->averageTime<br />";
        return $stats;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Assigns POST data and enables POST settings in the CURL object. Primarily used
    //from 3rd argument of ripRun(). $data is a ready-made body string, or an array of
    //name=>value pairs which are joined as name=value&name=value.
    //
    //NOTE(review): array keys and values are converted to UTF-8 but NOT URL-encoded,
    //exactly as before; callers must urlencode() values containing "&", "=" or spaces.
    //
    //Returns true if settings set and data ready for ripRun().
    //
    public function postRequest($data){
        if(empty($data)){
            $this->error = "postRequest: No post data specified";
            return false;
        }
        if(is_array($data)){
            $pairs = array();
            foreach($data as $k => $v){
                $pairs[] = $this->toUtf8($k) . "=" . $this->toUtf8($v);
            }
            $data = implode('&', $pairs);
        }
        $this->post();
        //postFields() returns nothing, so its result cannot be used as a success flag
        $this->postFields($data);
        $this->postData = $data;
        return true;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Main ripCurl execution method. $url is the URL to pull from. $fixLinks is a boolean
    //option to automatically fix relative links (resolved against the URL that was actually
    //fetched after redirects). $postdata can be array or urlencoded string. Result page is
    //both returned and set in $this->rawHtml.
    //
    //Returns HTML content retrieved from URL query, or FALSE when the transfer failed
    //
    public function ripRun($url, $fixLinks = null, $postdata = null){
        $this->url($url);
        if(!is_null($postdata)){
            if($this->postRequest($postdata)){
                $this->postViaRun = true;
            }
        }elseif($this->postViaRun){
            //The previous run switched the handle to POST; without this every later page would be POSTed too
            $this->httpGet();
            $this->postViaRun = false;
        }

        $result = curl_exec($this->ch);
        $this->rawHtml = $result;
        $info = curl_getinfo($this->ch);
        $this->lastUrl = isset($info['url']) ? $info['url'] : null;
        $this->lastStatus = isset($info['http_code']) ? $info['http_code'] : null;

        if($result === false){
            $this->error = "ripRun: " . curl_error($this->ch);
            $this->httpErrors++;
        }elseif($this->lastStatus >= 400){
            $this->httpErrors++;
        }

        if($fixLinks && is_string($result) && $result !== ''){
            $base = empty($this->lastUrl) ? $url : $this->lastUrl;
            $this->fixLinks($base, $result);
        }

        //Stats tracking
        $this->totalConnections = $this->totalConnections + 1;
        $this->totalTime = $this->totalTime + (isset($info['total_time']) ? $info['total_time'] : 0);
        $this->totalSize = $this->totalSize + (isset($info['size_download']) ? $info['size_download'] : 0);
        //Incremental mean: fold this transfer's speed into the average of the previous ones
        $speed = isset($info['speed_download']) ? $info['speed_download'] : 0;
        $this->averageSpeed = $this->averageSpeed + ($speed - $this->averageSpeed) / $this->totalConnections;
        $this->totalRedirects = $this->totalRedirects + (isset($info['redirect_count']) ? $info['redirect_count'] : 0);
        $this->averageTime = $this->totalTime / $this->totalConnections;

        return $this->getRawHtml();
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //XML methods
    /////////////////////////////////////////////////////////////////////////////////////

    /////////////////////////////////////////////////////////////////////////////////////
    //Parses a local XML file. An XML declaration is added when the file has none.
    //
    //Returns nested array of the document (see xml2array), or FALSE on a bad file or invalid XML
    //
    public function getXMLArray($filename){
        if(!is_string($filename) || !is_file($filename) || !is_readable($filename)){
            $this->error = "getXMLArray: Bad filename " . (is_string($filename) ? $filename : '');
            return false;
        }
        $contents = file_get_contents($filename);
        if($contents === false){
            $this->error = "getXMLArray: Bad filename $filename";
            return false;
        }
        //The declaration must be the very first thing in the document, and must not be added twice
        $contents = ltrim($contents);
        if(strncasecmp($contents, '<?xml', 5) != 0){
            $contents = '<?xml version="1.0"?>' . $contents;
        }

        $xml = new DOMDocument();
        $xml->preserveWhiteSpace = false;
        $previous = libxml_use_internal_errors(true);
        $loaded = $xml->loadXML($contents);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        if(!$loaded){
            $this->error = "getXMLArray: $filename is not valid XML";
            return false;
        }
        return $this->xml2array($xml);
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Creates array of XML file or existing XML array and pulls out only the desired named
    //fields, at any depth. Only text values are collected; an element that contains child
    //elements is searched, not returned.
    //
    //Returns array of values of the named fields (empty when there are none), FALSE on bad input
    //
    public function getXMLElements($filename, $elementName){
        //Arrays are tested first: is_file() does not accept an array
        if(is_array($filename)){
            $data = $filename;
        }elseif(is_string($filename) && is_file($filename) && is_readable($filename)){
            $data = $this->getXMLArray($filename);
            if(!is_array($data)){
                return false;
            }
        }else{
            $this->error = "getXMLElements: Bad filename or array given";
            return false;
        }

        $return = array();
        $wanted = (string) $elementName;
        foreach($data as $k => $v){
            $isMatch = ((string) $k === $wanted);
            if(!is_array($v)){
                if($isMatch){
                    $return[] = $v;
                }
                continue;
            }
            if($isMatch){
                //Repeated elements arrive as a list under their name; take the text entries
                foreach($v as $ck => $cv){
                    if(is_int($ck) && !is_array($cv)){
                        $return[] = $cv;
                    }
                }
            }
            //Keep what the nested levels find (the original discarded this result)
            $return = array_merge($return, $this->getXMLElements($v, $elementName));
        }
        return $return;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //Helper function for XML parsing. A node without element children becomes its text
    //('' when empty). Otherwise the result is an array keyed by child element name; a name
    //that occurs more than once holds a list of values. Text mixed in between child
    //elements, comments and processing instructions are ignored.
    //
    //Returns XML structure as array
    //
    private function xml2array($n){
        //Count element children per name so repeated siblings can be turned into lists
        $occurrences = array();
        foreach($n->childNodes as $nc){
            if($nc->nodeType == XML_ELEMENT_NODE){
                $occurrences[$nc->nodeName] = isset($occurrences[$nc->nodeName]) ? $occurrences[$nc->nodeName] + 1 : 1;
            }
        }

        if(count($occurrences) == 0){
            $text = '';
            foreach($n->childNodes as $nc){
                if($nc->nodeType == XML_TEXT_NODE || $nc->nodeType == XML_CDATA_SECTION_NODE){
                    $text .= $nc->nodeValue;
                }
            }
            return $text;
        }

        $return = array();
        foreach($n->childNodes as $nc){
            if($nc->nodeType != XML_ELEMENT_NODE){
                continue;
            }
            if($occurrences[$nc->nodeName] > 1){
                $return[$nc->nodeName][] = $this->xml2array($nc);
            }else{
                $return[$nc->nodeName] = $this->xml2array($nc);
            }
        }
        return $return;
    }

    /////////////////////////////////////////////////////////////////////////////////////
    //This section is to layout future development ideas and projects
    //
    //Expansion ideas:
    //Expand image features with GD lib
    //Build XML parsing methods
    //Include validation checks for XML, HTML, CSS, etc.
    //Include limited JavaScript parsing capabilities
    //Build pagination methods with callback functionality
    //
    /////////////////////////////////////////////////////////////////////////////////////
}
?>
[/code]

Link to comment https://forums.phpfreaks.com/topic/36235-ripcurl-class-for-retrieving-storing-and-parsing-web-content-through-curl/ Share on other sites More sharing options...
Recommended Posts