Jump to content

indexer


shivam0101

Recommended Posts

I am trying to write an indexer.  I am not getting all the links.  For example,

 

if page_1.php is the first page and has the bellow links,

 

echo "<a href='http://localhost/my_search/search/page_1.php'>a</a>";

echo "<a href='http://localhost/my_search/search/page_2.php'>b</a>";

 

and page_2.php has the below links,

echo "<a href='http://localhost/my_search/search/page_1.php'>a</a>";

echo "<a href='http://localhost/my_search/search/page_2.php'>b</a>";

echo "<a href='http://localhost/my_search/search/page_3.php'>c</a>";

 

and page_3.php has the below links,

echo "<a href='http://localhost/my_search/search/page_4.php'>b</a>";

echo "<a href='http://localhost/my_search/search/page_5.php'>c</a>";

 

I am not getting the links,

 

echo "<a href='http://localhost/my_search/search/page_4.php'>b</a>";

echo "<a href='http://localhost/my_search/search/page_5.php'>c</a>";

 

 

 

 

 

below is the code,

 

 

 

function getContents($url)

{

$file_path = getFullPath($url);

 

    return file_get_contents($file_path);

}

 

 

function getLinks($url)

{

  $fc = getContents($url);

  preg_match_all('/\s+href\s*=\s*[\"\']?([^\s\"\']+)[\"\'\s]+/ims', $fc, $links);

 

  return $links[1];

}

 

 

 

 

 

 

 

function getFullPath($url)

{

  $urlparts = parse_url($url);

  $path = $urlparts['path'];

  $host = $urlparts['host'];

 

/*

  if($scheme == '')

    $scheme = 'http://';

   

  if($host == '')

    $host = 'localhost/my_search/';

 

if ($urlparts['query'] != "")

$path .= "?".$urlparts['query'];

*/

return $url;

 

}

 

 

function getText($text)

{

//extracting page contetns from matching <body> </body> tag

$text = str_replace("<BODY","<body",$text);

$text = str_replace("</BODY>","</body>",$text);

$text = '<'.ltrim(ExtractString($text,'<body','</body>'));

$text = preg_replace("/</"," <",$text);

$text = preg_replace("/>/","> ",$text);

$text = preg_replace("/(\<script)(.*?)(script>)/si", " ", $text); //erasing scripts

$text = preg_replace("/(<a>(.*?)a>)/siU", " ", $text); //erasing links

$text = strip_tags($text);

$text = str_replace("<!--", "<!--", $text);

$text = preg_replace("/(\<)(.*?)(--\>)/mi", "".nl2br("\\2")."", $text);

 

while($text != strip_tags($text))

{

$text = strip_tags($text);

}

 

$text=ereg_replace(' '," ",$text);

$text = ereg_replace("[^[:alpha:].,']", " ", $text);

 

while(strpos($text,"  ")!==false)

{

$text=str_replace("  "," ",$text);

}

 

while(strpos($text,",,")!==false)

{

$text=str_replace(",,",",",$text);

}

 

while(strpos($text,"..")!==false)

{

$text=str_replace("..",".",$text);

}

 

return $text;

}

 

 

 

function ExtractString($str, $start, $end)

{

  $str_low = strtolower($str);

  $pos_start = strpos($str_low, $start);

  $pos_end = strpos($str_low, $end, ($pos_start + strlen($start)));

  if ( ($pos_start !== false) && ($pos_end !== false) )

  {

  $pos1 = $pos_start + strlen($start);

  $pos2 = $pos_end - $pos1;

  return substr($str, $pos1, $pos2);

  }

  return false;

}

 

 

function saveKeywords($id, $text)

{

$exp_text = explode(' ', $text);

 

foreach($exp_text as $v)

{

if(!empty($v))

$insert_keyword = mysql_query("INSERT INTO keywords SET url_id=$id, keyword='$v'");

}

}

 

 

 

function indexSite($site_url)

{

$links = getLinks($site_url);

 

foreach($links as $link => $linkv)

{

$linkv = getFullPath($linkv);

$fp = file_get_contents($linkv);

$md5 = md5($fp);

 

eregi("<title>(.+)</title>", $fp, $regs);

$title = $regs[1];

 

$text = getText($fp);

$date = date('Y-m-d');

 

 

$check_res = mysql_query("SELECT * FROM url WHERE url='$linkv'");

$check_num_rows = mysql_num_rows($check_res);

 

if($check_num_rows > 0)

{

$get_md5_res = mysql_query("SELECT md5 FROM url WHERE url='$linkv'");

$get_md5_ret = mysql_fetch_assoc($get_md5_res);

$existing_md5 = $get_md5_ret['md5'];

 

if($existing_md5 !=$md5)

$process = 'update';

}

else

{

$process = 'insert';

}

 

 

if($process == 'insert')

{

$insert_site_details = mysql_query("INSERT INTO url SET url='$linkv', title='$title', full_text='$text', indexed_date='$date', md5='$md5'");

$insert_id = mysql_insert_id();

 

echo "INSERTING $linkv <br/>";

 

$insert_keywords = saveKeywords($insert_id, $text);

}

elseif($process == 'update')

{

$update_site_details = mysql_query("UPDATE url SET title='$title', full_text='$text', indexed_date='$date', md5='$md5' WHERE url='$linkv'");

$insert_keywords = saveKeywords($insert_id, $text);

}

 

}

}

 

 

 

$links = getLinks('http://localhost/my_search/search/page_1.php');

 

 

foreach($links as $link)

{

indexSite($link);

}

Link to comment
https://forums.phpfreaks.com/topic/129679-indexer/
Share on other sites

I am trying to write an indexer.  I am not getting all the links.  For example,

 

if page_1.php is the first page and has the bellow links,

 

echo "<a href='http://localhost/my_search/search/page_1.php'>a</a>";
echo "<a href='http://localhost/my_search/search/page_2.php'>b</a>";

 

and page_2.php has the below links,

echo "<a href='http://localhost/my_search/search/page_1.php'>a</a>";
echo "<a href='http://localhost/my_search/search/page_2.php'>b</a>";
echo "<a href='http://localhost/my_search/search/page_3.php'>c</a>";

 

and page_3.php has the below links,

echo "<a href='http://localhost/my_search/search/page_4.php'>b</a>";
echo "<a href='http://localhost/my_search/search/page_5.php'>c</a>";

 

I am not getting the links,

 

echo "<a href='http://localhost/my_search/search/page_4.php'>b</a>";
echo "<a href='http://localhost/my_search/search/page_5.php'>c</a>";

 

 

 

 

 

below is the code,

 

 

 

function getContents($url)
{
   $file_path = getFullPath($url);

    return file_get_contents($file_path);
}


function getLinks($url)
{
  $fc = getContents($url);
  preg_match_all('/\s+href\s*=\s*[\"\']?([^\s\"\']+)[\"\'\s]+/ims', $fc, $links);

  return $links[1];
}







function getFullPath($url)
{
   $urlparts = parse_url($url);
   $path = $urlparts['path'];
   $host = $urlparts['host'];
   
/*
   if($scheme == '')
        $scheme = 'http://';
   
   if($host == '')
        $host = 'localhost/my_search/';
    
   if ($urlparts['query'] != "")
      $path .= "?".$urlparts['query'];
*/      
   return $url;

}


   function getText($text)
   {   
      //extracting page contetns from matching <body> </body> tag
      $text = str_replace("<BODY","<body",$text);
      $text = str_replace("</BODY>","</body>",$text);
      $text = '<'.ltrim(ExtractString($text,'<body','</body>'));            
      $text = preg_replace("/</"," <",$text);
      $text = preg_replace("/>/","> ",$text);      
      $text = preg_replace("/(\<script)(.*?)(script>)/si", " ", $text); //erasing scripts      
      $text = preg_replace("/(<a>(.*?)a>)/siU", " ", $text); //erasing links                
      $text = strip_tags($text);
      $text = str_replace("<!--", "<!--", $text);
      $text = preg_replace("/(\<)(.*?)(--\>)/mi", "".nl2br("\\2")."", $text);
      
      while($text != strip_tags($text))
      {
         $text = strip_tags($text);
      }
      
      $text=ereg_replace(' '," ",$text);
      $text = ereg_replace("[^[:alpha:].,']", " ", $text);
      
      while(strpos($text,"  ")!==false)
      {
         $text=str_replace("  "," ",$text);
      }
      
      while(strpos($text,",,")!==false)
      {
         $text=str_replace(",,",",",$text);
      }
      
      while(strpos($text,"..")!==false)
      {
         $text=str_replace("..",".",$text);
      }
      
      return $text;
   }



   function ExtractString($str, $start, $end)
   {
      $str_low = strtolower($str);
      $pos_start = strpos($str_low, $start);
      $pos_end = strpos($str_low, $end, ($pos_start + strlen($start)));
      if ( ($pos_start !== false) && ($pos_end !== false) )
      {
         $pos1 = $pos_start + strlen($start);
         $pos2 = $pos_end - $pos1;
         return substr($str, $pos1, $pos2);
      }
      return false;
   }
   
   
   function saveKeywords($id, $text)
   {
      $exp_text = explode(' ', $text);
      
      foreach($exp_text as $v)
      {
         if(!empty($v))
            $insert_keyword = mysql_query("INSERT INTO keywords SET url_id=$id, keyword='$v'");
      }   
   }



function indexSite($site_url)
{
   $links = getLinks($site_url);
   
   foreach($links as $link => $linkv)
   {
      $linkv = getFullPath($linkv);
      $fp = file_get_contents($linkv);
      $md5 = md5($fp);
      
      eregi("<title>(.+)</title>", $fp, $regs);
      $title = $regs[1];

      $text = getText($fp);
      $date = date('Y-m-d');
      
      
      $check_res = mysql_query("SELECT * FROM url WHERE url='$linkv'");
      $check_num_rows = mysql_num_rows($check_res);
      
      if($check_num_rows > 0)
      {
         $get_md5_res = mysql_query("SELECT md5 FROM url WHERE url='$linkv'");
         $get_md5_ret = mysql_fetch_assoc($get_md5_res);
         $existing_md5 = $get_md5_ret['md5'];
         
         if($existing_md5 !=$md5)
            $process = 'update';
      }
      else
      {
            $process = 'insert';
      }
         
      
      if($process == 'insert')
      {         
         $insert_site_details = mysql_query("INSERT INTO url SET url='$linkv', title='$title', full_text='$text', indexed_date='$date', md5='$md5'");
         $insert_id = mysql_insert_id();
         
         echo "INSERTING $linkv <br/>";

         $insert_keywords = saveKeywords($insert_id, $text);
      }
      elseif($process == 'update')
      {
         $update_site_details = mysql_query("UPDATE url SET title='$title', full_text='$text', indexed_date='$date', md5='$md5' WHERE url='$linkv'");
         $insert_keywords = saveKeywords($insert_id, $text);
      }

    }
}



$links = getLinks('http://localhost/my_search/search/page_1.php');


foreach($links as $link)
{
   indexSite($link);
} 

Link to comment
https://forums.phpfreaks.com/topic/129679-indexer/#findComment-672416
Share on other sites

Archived

This topic is now archived and is closed to further replies.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.