shivam0101 Posted October 23, 2008 Share Posted October 23, 2008 I am trying to write an indexer. I am not getting all the links. For example, if page_1.php is the first page and has the below links, echo "<a href='http://localhost/my_search/search/page_1.php'>a</a>"; echo "<a href='http://localhost/my_search/search/page_2.php'>b</a>"; and page_2.php has the below links, echo "<a href='http://localhost/my_search/search/page_1.php'>a</a>"; echo "<a href='http://localhost/my_search/search/page_2.php'>b</a>"; echo "<a href='http://localhost/my_search/search/page_3.php'>c</a>"; and page_3.php has the below links, echo "<a href='http://localhost/my_search/search/page_4.php'>b</a>"; echo "<a href='http://localhost/my_search/search/page_5.php'>c</a>"; I am not getting the links, echo "<a href='http://localhost/my_search/search/page_4.php'>b</a>"; echo "<a href='http://localhost/my_search/search/page_5.php'>c</a>"; below is the code, function getContents($url) { $file_path = getFullPath($url); return file_get_contents($file_path); } function getLinks($url) { $fc = getContents($url); preg_match_all('/\s+href\s*=\s*[\"\']?([^\s\"\']+)[\"\'\s]+/ims', $fc, $links); return $links[1]; } function getFullPath($url) { $urlparts = parse_url($url); $path = $urlparts['path']; $host = $urlparts['host']; /* if($scheme == '') $scheme = 'http://'; if($host == '') $host = 'localhost/my_search/'; if ($urlparts['query'] != "") $path .= "?".$urlparts['query']; */ return $url; } function getText($text) { //extracting page contetns from matching <body> </body> tag $text = str_replace("<BODY","<body",$text); $text = str_replace("</BODY>","</body>",$text); $text = '<'.ltrim(ExtractString($text,'<body','</body>')); $text = preg_replace("/</"," <",$text); $text = preg_replace("/>/","> ",$text); $text = preg_replace("/(\<script)(.*?)(script>)/si", " ", $text); //erasing scripts $text = preg_replace("/(<a>(.*?)a>)/siU", " ", $text); //erasing links $text = strip_tags($text); $text = 
str_replace("<!--", "<!--", $text); $text = preg_replace("/(\<)(.*?)(--\>)/mi", "".nl2br("\\2")."", $text); while($text != strip_tags($text)) { $text = strip_tags($text); } $text=ereg_replace(' '," ",$text); $text = ereg_replace("[^[:alpha:].,']", " ", $text); while(strpos($text," ")!==false) { $text=str_replace(" "," ",$text); } while(strpos($text,",,")!==false) { $text=str_replace(",,",",",$text); } while(strpos($text,"..")!==false) { $text=str_replace("..",".",$text); } return $text; } function ExtractString($str, $start, $end) { $str_low = strtolower($str); $pos_start = strpos($str_low, $start); $pos_end = strpos($str_low, $end, ($pos_start + strlen($start))); if ( ($pos_start !== false) && ($pos_end !== false) ) { $pos1 = $pos_start + strlen($start); $pos2 = $pos_end - $pos1; return substr($str, $pos1, $pos2); } return false; } function saveKeywords($id, $text) { $exp_text = explode(' ', $text); foreach($exp_text as $v) { if(!empty($v)) $insert_keyword = mysql_query("INSERT INTO keywords SET url_id=$id, keyword='$v'"); } } function indexSite($site_url) { $links = getLinks($site_url); foreach($links as $link => $linkv) { $linkv = getFullPath($linkv); $fp = file_get_contents($linkv); $md5 = md5($fp); eregi("<title>(.+)</title>", $fp, $regs); $title = $regs[1]; $text = getText($fp); $date = date('Y-m-d'); $check_res = mysql_query("SELECT * FROM url WHERE url='$linkv'"); $check_num_rows = mysql_num_rows($check_res); if($check_num_rows > 0) { $get_md5_res = mysql_query("SELECT md5 FROM url WHERE url='$linkv'"); $get_md5_ret = mysql_fetch_assoc($get_md5_res); $existing_md5 = $get_md5_ret['md5']; if($existing_md5 !=$md5) $process = 'update'; } else { $process = 'insert'; } if($process == 'insert') { $insert_site_details = mysql_query("INSERT INTO url SET url='$linkv', title='$title', full_text='$text', indexed_date='$date', md5='$md5'"); $insert_id = mysql_insert_id(); echo "INSERTING $linkv <br/>"; $insert_keywords = saveKeywords($insert_id, $text); } 
elseif($process == 'update') { $update_site_details = mysql_query("UPDATE url SET title='$title', full_text='$text', indexed_date='$date', md5='$md5' WHERE url='$linkv'"); $insert_keywords = saveKeywords($insert_id, $text); } } } $links = getLinks('http://localhost/my_search/search/page_1.php'); foreach($links as $link) { indexSite($link); } Link to comment https://forums.phpfreaks.com/topic/129679-indexer/ Share on other sites More sharing options...
trq Posted October 23, 2008 Share Posted October 23, 2008 Please, we have [code] tags for a reason. Link to comment https://forums.phpfreaks.com/topic/129679-indexer/#findComment-672361 Share on other sites More sharing options...
shivam0101 Posted October 23, 2008 Author Share Posted October 23, 2008 I am trying to write an indexer. I am not getting all the links. For example, if page_1.php is the first page and has the below links, echo "<a href='http://localhost/my_search/search/page_1.php'>a</a>"; echo "<a href='http://localhost/my_search/search/page_2.php'>b</a>"; and page_2.php has the below links, echo "<a href='http://localhost/my_search/search/page_1.php'>a</a>"; echo "<a href='http://localhost/my_search/search/page_2.php'>b</a>"; echo "<a href='http://localhost/my_search/search/page_3.php'>c</a>"; and page_3.php has the below links, echo "<a href='http://localhost/my_search/search/page_4.php'>b</a>"; echo "<a href='http://localhost/my_search/search/page_5.php'>c</a>"; I am not getting the links, echo "<a href='http://localhost/my_search/search/page_4.php'>b</a>"; echo "<a href='http://localhost/my_search/search/page_5.php'>c</a>"; below is the code, function getContents($url) { $file_path = getFullPath($url); return file_get_contents($file_path); } function getLinks($url) { $fc = getContents($url); preg_match_all('/\s+href\s*=\s*[\"\']?([^\s\"\']+)[\"\'\s]+/ims', $fc, $links); return $links[1]; } function getFullPath($url) { $urlparts = parse_url($url); $path = $urlparts['path']; $host = $urlparts['host']; /* if($scheme == '') $scheme = 'http://'; if($host == '') $host = 'localhost/my_search/'; if ($urlparts['query'] != "") $path .= "?".$urlparts['query']; */ return $url; } function getText($text) { //extracting page contetns from matching <body> </body> tag $text = str_replace("<BODY","<body",$text); $text = str_replace("</BODY>","</body>",$text); $text = '<'.ltrim(ExtractString($text,'<body','</body>')); $text = preg_replace("/</"," <",$text); $text = preg_replace("/>/","> ",$text); $text = preg_replace("/(\<script)(.*?)(script>)/si", " ", $text); //erasing scripts $text = preg_replace("/(<a>(.*?)a>)/siU", " ", $text); //erasing links $text = strip_tags($text); $text = 
str_replace("<!--", "<!--", $text); $text = preg_replace("/(\<)(.*?)(--\>)/mi", "".nl2br("\\2")."", $text); while($text != strip_tags($text)) { $text = strip_tags($text); } $text=ereg_replace(' '," ",$text); $text = ereg_replace("[^[:alpha:].,']", " ", $text); while(strpos($text," ")!==false) { $text=str_replace(" "," ",$text); } while(strpos($text,",,")!==false) { $text=str_replace(",,",",",$text); } while(strpos($text,"..")!==false) { $text=str_replace("..",".",$text); } return $text; } function ExtractString($str, $start, $end) { $str_low = strtolower($str); $pos_start = strpos($str_low, $start); $pos_end = strpos($str_low, $end, ($pos_start + strlen($start))); if ( ($pos_start !== false) && ($pos_end !== false) ) { $pos1 = $pos_start + strlen($start); $pos2 = $pos_end - $pos1; return substr($str, $pos1, $pos2); } return false; } function saveKeywords($id, $text) { $exp_text = explode(' ', $text); foreach($exp_text as $v) { if(!empty($v)) $insert_keyword = mysql_query("INSERT INTO keywords SET url_id=$id, keyword='$v'"); } } function indexSite($site_url) { $links = getLinks($site_url); foreach($links as $link => $linkv) { $linkv = getFullPath($linkv); $fp = file_get_contents($linkv); $md5 = md5($fp); eregi("<title>(.+)</title>", $fp, $regs); $title = $regs[1]; $text = getText($fp); $date = date('Y-m-d'); $check_res = mysql_query("SELECT * FROM url WHERE url='$linkv'"); $check_num_rows = mysql_num_rows($check_res); if($check_num_rows > 0) { $get_md5_res = mysql_query("SELECT md5 FROM url WHERE url='$linkv'"); $get_md5_ret = mysql_fetch_assoc($get_md5_res); $existing_md5 = $get_md5_ret['md5']; if($existing_md5 !=$md5) $process = 'update'; } else { $process = 'insert'; } if($process == 'insert') { $insert_site_details = mysql_query("INSERT INTO url SET url='$linkv', title='$title', full_text='$text', indexed_date='$date', md5='$md5'"); $insert_id = mysql_insert_id(); echo "INSERTING $linkv <br/>"; $insert_keywords = saveKeywords($insert_id, $text); } 
elseif($process == 'update') { $update_site_details = mysql_query("UPDATE url SET title='$title', full_text='$text', indexed_date='$date', md5='$md5' WHERE url='$linkv'"); $insert_keywords = saveKeywords($insert_id, $text); } } } $links = getLinks('http://localhost/my_search/search/page_1.php'); foreach($links as $link) { indexSite($link); } Link to comment https://forums.phpfreaks.com/topic/129679-indexer/#findComment-672416 Share on other sites More sharing options...
Recommended Posts
Archived
This topic is now archived and is closed to further replies.