Jump to content

Batfan

New Members
  • Posts

    7
  • Joined

  • Last visited

    Never

Profile Information

  • Gender
    Not Telling

Batfan's Achievements

Newbie

Newbie (1/5)

0

Reputation

  1. Sure. What variables should I be printing?
  2. Sure, so using lipsum.com as a test, here is the result: [1] - http://lipsum.com - Match Percentage:67.4% [2] - http://hy.lipsum.com/ - Match Percentage:60.14% [3] - http://sq.lipsum.com/ - Match Percentage:63.54% [4] - http://ar.lipsum.com/ - Match Percentage:57.79% [5] - http://bg.lipsum.com/ - Match Percentage:61.2% [6] - http://ca.lipsum.com/ - Match Percentage:65.17% [7] - http://hr.lipsum.com/ - Match Percentage:65.65% [8] - http://cs.lipsum.com/ - Match Percentage:64.63% [9] - http://da.lipsum.com/ - Match Percentage:65.17% [10] - http://nl.lipsum.com/
  3. Right. The default is set @ 25 but, it only lists 10. It should be doing 25 (or whatever the max is). It's consistent
  4. I did not increase both but, the default "links to crawl" value is 25
  5. But wouldn't changing the link depth in the form fix that? I've tried selecting a higher link depth and it still stops after 10
  6. First and foremost, I am a PHP novice so, if there's a better or more efficient way of doing what I'm trying to do, please feel free to point it out. I came across an old PHP script that was used to crawl a site and check the response code on the pages found. I have modified it to do a duplicate content check. It's using the similar_text function to compare 1 page's content (specified by the user) against the content of each page it finds. It's a little slow but, it's working. The only problem that I'm having is that it stops after about the first 10 links and I can't figure out why. Any help is greatly appreciated.

<form action="<?php echo $_SERVER['PHP_SELF']; ?>" method="post">
  <div class="row"><label for="page1" class="small label"><strong>Page? </strong>: </label><input type="text" name="page1" id="page1" value="" size="40" /></div>
  <div class="row"><label for="url" class="small label"><strong>Please Enter URL </strong>: </label><input type="text" name="url" id="url" value="" size="40" /></div>
  <div class="row"><label for="maxlinks" class="small label"><strong>Number of links to get </strong>: </label><input type="text" name="maxlinks" id="maxlinks" value="25" size="3" maxlength="3" /></div>
  <div class="row"><label for="linkdepth" class="small label"><strong>Links Maximum depth</strong> : </label>
    <select name="linkdepth" id="linkdepth">
      <option value="1">1</option>
      <option value="2" selected="selected">2</option>
      <option value="3">3</option>
      <option value="4">4</option>
      <option value="5">5</option>
      <option value="6">6</option>
    </select></div>
  <input type="submit" name="submit" style="font-weight: bold" value="Check links" id="submit" />
</form>
<?php
if (isset($_POST['submit'])) {
    $page1    = $_POST['page1'];           // reference page for the duplicate-content check
    $baseurl  = $_POST['url'];             // seed URL of the crawl
    $pages    = array();                   // url => NULL (queued) or array (crawled)
    $i        = $_POST['linkdepth'];       // maximum link depth; read as a global by get_urls()
    $maxlinks = (integer) $_POST['maxlinks'];

    $domain = extract_domain_name($baseurl);
    echo '<p class="small">Extracted domain name: <strong>' . $domain . '</strong>. ';
    echo 'Maximum depth: <strong>' . $i . '</strong></p>';

    /**
     * Download $page with cURL and return every same-(sub-)domain link
     * found in it, up to the global depth limit $i. Returns false when
     * the page could not be downloaded.
     */
    function get_urls($page)
    {
        global $domain, $i;

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $page);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_HEADER, true);
        /* Spoof the User-Agent header value; just to be safe */
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');
        /* Connection and download timeouts so the script does not get stuck
           downloading huge files or waiting on a nonresponsive server. */
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 100);
        curl_setopt($ch, CURLOPT_TIMEOUT, 100);
        curl_setopt($ch, CURLOPT_FAILONERROR, 0);

        /* BUG FIX: the original called curl_exec() a second time inside the
           error check, downloading every page twice. Execute once and test
           the stored result instead. */
        $html = curl_exec($ch);
        if ($html === false) {
            echo '<p class="small">Error. Please check URL: <strong style="color:#ae3100">'
                . curl_error($ch) . '</p></strong>';
        }
        curl_close($ch);

        if (!$html) return false;

        /* Honour a <base href="..."> tag, if present, when converting
           relative URLs to absolute ones below. */
        if (preg_match('/<base[\s]+href=\s*[\"\']?([^\'\" >]+)[\'\" >]/i', $html, $matches)) {
            $base_url = $matches[1];
            echo $base_url;
        } else {
            $base_url = $page; // fall back to the page currently being crawled
        }

        $links = array();
        $html  = str_replace("\n", ' ', $html);
        preg_match_all('/<a[\s]+[^>]*href\s*=\s*[\"\']?([^\'\" >]+)[\'\" >]/i', $html, $m);

        foreach ($m[1] as $url) {
            $url = trim($url);
            /* Strip PHPSESSID and #fragments, decode &amp;, drop javascript: links */
            $url = preg_replace(
                array('/([\?&]PHPSESSID=\w+)$/i', '/(#[^\/]*)$/i', '/&amp;/', '/^(javascript:.*)/i'),
                array('', '', '&', ''),
                $url
            );
            $url = relative2absolute($base_url, $url);

            // Keep only URLs in the same (sub-)domain
            if (preg_match("/^http[s]?:\/\/[^\/]*" . str_replace('.', '\.', $domain) . "/i", $url)) {
                /* Slash count approximates the link depth */
                $depth = substr_count($url, "/") - 2;
                if ($depth <= $i) {
                    /* BUG FIX: the third argument was the undefined constant
                       "check" (raised a notice); use true for a strict check. */
                    if (!in_array($url, $links, true)) {
                        $links[] = $url;
                    }
                }
            }
        }
        return $links;
    }

    /**
     * Return the next queued URL in $pages (value still NULL), printing
     * its 1-based position, or NULL when everything has been crawled.
     */
    function next_page()
    {
        global $pages;
        foreach (array_keys($pages) as $k => $page) {
            if ($pages[$page] == NULL) {
                echo '[' . ($k + 1) . '] - ';
                return $page;
            }
        }
        return NULL;
    }

    /**
     * Crawl $page: print its similarity percentage against the reference
     * page and queue its links in $pages (up to $maxlinks entries).
     */
    function add_urls($page)
    {
        global $pages, $maxlinks;

        /* BUG FIX: microtime() without an argument returns a string;
           microtime(true) returns a float usable in arithmetic. The timing
           makes it possible to see on which page the crawler stalls. */
        $start    = microtime(true);
        $urls     = get_urls($page);
        $resptime = microtime(true) - $start;

        /* HEAD request kept so the commented-out status check below can be
           re-enabled easily. */
        $ch = curl_init($page);
        curl_setopt($ch, CURLOPT_NOBODY, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_exec($ch);
        $info = curl_getinfo($ch);

        print "$page";
        // if($info['http_code']==200) {
        /* PERF FIX: download the reference page once and cache it instead of
           re-fetching it for every crawled page. */
        static $page1data = null;
        if ($page1data === null) {
            $page1data = file_get_contents($_POST['page1']);
        }
        $page2 = file_get_contents($page);
        similar_text($page1data, $page2, $p);
        echo ' - Match Percentage:' . round($p, 2) . '%';
        // } else { echo '<strong style="color:#ba3d00"> NO </strong>'; }
        // echo substr(($resptime),0,5)." seconds"; // uncomment to show per-page crawl time
        echo '<br/>';
        curl_close($ch);

        $pages[$page] = array('resptime' => floor($resptime * 9000), 'url' => $page);
        /* BUG FIX: get_urls() returns false on download failure; guard so the
           foreach below does not warn on a non-array. */
        if (is_array($urls)) {
            foreach ($urls as $url) {
                if (!array_key_exists($url, $pages) && !in_array($url, $pages) && count($pages) < $maxlinks) {
                    $pages[$url] = NULL;
                }
            }
        }
    }

    echo '[1] - '; // the seed URL itself, extracted from the form input
    add_urls($baseurl);
    while (($page = next_page()) != NULL) { // while uncrawled URLs remain
        add_urls($page);
    }

    echo '<p class="small">Amount of crawled links: <strong>' . count($pages) . '</strong></p>';
    if (count($pages) < $maxlinks) {
        echo '<p class="small">Sorry, no more links to crawl!!</p>';
    }
}
?><?php
/**
 * Return the registrable domain (last two host-name segments) of $url,
 * e.g. "http://www.example.com/x" => "example.com".
 */
function extract_domain_name($url)
{
    /* BUG FIX: the original pattern only recognised "http://", so for https
       URLs the scheme stayed glued to the host and extraction failed. */
    preg_match("/^(https?:\/\/)?([^\/]+)/i", $url, $matches);
    $host = $matches[2];
    // keep only the last two segments of the host name
    preg_match("/[^\.\/]+\.[^\.\/]+$/", $host, $matches);
    return $matches[0];
}

/**
 * Resolve $relative against the $absolute base URL and return an absolute
 * URL. URLs that already carry a scheme are returned unchanged.
 */
function relative2absolute($absolute, $relative)
{
    $p = parse_url($relative);
    if (isset($p['scheme'])) return $relative; // already absolute

    /* Replaced extract(parse_url(...)) with explicit assignments: extract()
       hides which variables exist and warned on missing URL components. */
    $parts  = parse_url($absolute);
    $scheme = isset($parts['scheme']) ? $parts['scheme'] : '';
    $user   = isset($parts['user'])   ? $parts['user']   : '';
    $pass   = isset($parts['pass'])   ? $parts['pass']   : '';
    $host   = isset($parts['host'])   ? $parts['host']   : '';
    $path   = isset($parts['path'])   ? dirname($parts['path']) : '';

    /* BUG FIX: $relative{0} (curly-brace string offset) is a fatal error on
       PHP 8; square brackets are the supported syntax. */
    if ($relative[0] == '/') {
        // Root-relative: ignore the base path entirely
        $newPath = array_filter(explode("/", $relative));
    } else {
        // Merge base-directory segments with the relative segments, resolving ".."
        $aparts  = array_filter(explode("/", $path));
        $rparts  = array_filter(explode("/", $relative));
        $cparts  = array_merge($aparts, $rparts);
        $k       = 0;
        $newPath = array();
        foreach ($cparts as $idx => $part) {
            if ($part == '..') {
                $k--;
                $newPath[$k] = null; // erase the segment ".." cancels out
            } else {
                $newPath[$k] = $cparts[$idx];
                $k++;
            }
        }
        $newPath = array_filter($newPath);
    }
    $path = implode("/", $newPath);

    // Reassemble scheme://user:pass@host/path
    $url = "";
    if ($scheme) { $url = "$scheme://"; }
    if ($user) {
        $url .= "$user";
        if ($pass) { $url .= ":$pass"; }
        $url .= "@";
    }
    if ($host) { $url .= "$host/"; }
    $url .= $path;
    return $url;
}
?>
×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.