Jump to content

Crawled websites not inserting into my SQL table | PHP Web Crawler


ssscriptties
Go to solution Solved by requinix,

Recommended Posts

I've made a PHP web crawler and then made a MySQL table called "dex" (as in index). I connected to the database through PDO and tweaked the code to "INSERT" websites that aren't already crawled into the table and "UPDATE" websites that are, using URL hashes as an indicator or "id" for links. The terminal shows all the links and the links related to them, the if statement works perfectly, and there are no major errors, so why does it not insert the data into the "dex" table? Every time I check the table after the process, I only find the row that I inserted manually to test the if statement for "UPDATE" or "INSERT". What can I do to fix this issue and insert the data the crawler retrieves?

Test.html:

<a href="https://google.com"></a>
<a href="https://www.yahoo.com/"></a>
<a href="https://www.bing.com/"></a>
<a href="https://duckduckgo.com/"></a>

Crawler:

<?php

// Development settings: report every error level and print errors to the output.
error_reporting(E_ALL);
ini_set('display_errors', 1);

// Seed page the crawl starts from.
$start = "http://localhost/deepsearch/test.html";

// Connection to the "deepsearch" MySQL database on 127.0.0.1 (user root, empty password).
// NOTE(review): no PDO::ATTR_ERRMODE is passed, so whether a failing statement throws
// or only returns false depends on the PHP version's default — confirm for this setup.
$pdo = new PDO('mysql:host=127.0.0.1;dbname=deepsearch', 'root', '');

// Crawl state shared with follow_links() through `global`:
// every URL seen so far, and the URLs still queued to be followed.
$already_crawled = array();
$crawling = array();

/**
 * Fetch $url and return the page's title, meta description, meta keywords and
 * the URL itself as a hand-assembled JSON object string
 * (keys "Title", "Description", "Keywords", "URL").
 *
 * When DOMDocument::loadHTML() reports failure, a message is echoed and ''
 * is returned instead.
 */
function get_details($url) {
    // NOTE(review): the option is spelled 'headers' here; confirm against the
    // http stream-context documentation that this key is actually honoured.
    $options = array('http'=>array('method'=>"GET", 'headers'=>"User-Agent: howBot/0.1\n"));
    $context = stream_context_create($options);

    // Suppress warnings for HTML parsing errors
    libxml_use_internal_errors(true);

    $doc = new DOMDocument();
    // The "@" operators silence any warning raised by the fetch; no check is made
    // on what file_get_contents() returned before it is handed to loadHTML().
    @$html = @file_get_contents($url, false, $context);

    // Load HTML content and check for parsing errors
    if ($doc->loadHTML($html)) {
        // NOTE(review): $titleElements is never assigned anywhere in this function,
        // so empty() is always true, this branch is never taken, and $title is always "".
        if (!empty($titleElements)) {
            $title = $titleElements->item(0);
            $title = $title->nodeValue;
        } else {
            $title = "";
        }
        
        $description = "";
        $keywords = "";
        // Scan every <meta> element; the last matching one wins.
        $metas = $doc->getElementsByTagName("meta");
        for ($i = 0; $i < $metas->length; $i++) {
            $meta = $metas->item($i);

            // strtolower() is applied to the literal, not to the attribute value,
            // so the name attribute is compared case-sensitively.
            if ($meta->getAttribute("name") == strtolower("description")) {
                $description = $meta->getAttribute("content");
            }
            if ($meta->getAttribute("name") == strtolower("keywords")) {
                $keywords = $meta->getAttribute("content");
            }
        }

        // The JSON is built by string concatenation: only "\n" is stripped from the
        // values, nothing is escaped.
        return '{"Title": "'.str_replace("\n", "", $title).'", "Description": "'.str_replace("\n", "", $description).'", "Keywords": "'.str_replace("\n", "", $keywords).'", "URL": "'.$url.'"}';
    } else {
        // Handle the parsing error
        echo "HTML parsing error: " . libxml_get_last_error()->message . "\n";
        return ''; // Return an empty string or handle the error as needed
    }
}
/**
 * Fetch $url, turn every <a href> on it into an absolute URL, and for each URL
 * not seen before: record it, read its details through get_details(), look it
 * up in the "dex" table by the md5 of its URL and INSERT a row when none is
 * found. Afterwards the function calls itself for the queued URLs.
 *
 * Reads and modifies the globals $pdo, $already_crawled and $crawling.
 */
function follow_links($url)
{

    global $pdo;
    global $already_crawled;
    global $crawling;

    $options = array('http' => array('method' => "GET", 'headers' => "User-Agent: howBot/0.1\n"));
    $context = stream_context_create($options);

    // Fetch and parse the page; "@" hides both fetch and parser warnings.
    $doc = new DOMDocument();
    @$doc->loadHTML(@file_get_contents($url, false, $context));

    $linklist = $doc->getElementsByTagName("a");

    foreach ($linklist as $link) {
        $l = $link->getAttribute("href");

        // Rewrite the href into an absolute URL based on the scheme/host/path of $url.
        if (substr($l, 0, 1) == "/" && substr($l, 0, 2) != "//") {
            // Root-relative: "/path"
            $l = parse_url($url)["scheme"] . "://" . parse_url($url)["host"] . $l;
        } else if (substr($l, 0, 2) == "//") {
            // Scheme-relative: "//host/path"
            $l = parse_url($url)["scheme"] . ":" . $l;
        } else if (substr($l, 0, 2) == "./") {
            // Relative to the current directory: "./page"
            $l = parse_url($url)["scheme"] . "://" . parse_url($url)["host"] . dirname(parse_url($url)["path"]) . substr($l, 1);
        } else if (substr($l, 0, 1) == "#") {
            // Fragment on the current page
            $l = parse_url($url)["scheme"] . "://" . parse_url($url)["host"] . parse_url($url)["path"] . $l;
        } else if (substr($l, 0, 3) == "../") {
            // Parent-relative: the "../" is appended after the host unchanged.
            $l = parse_url($url)["scheme"] . "://" . parse_url($url)["host"] . "/" . $l;
        } else if (substr($l, 0, 11) == "javascript:") {
            // Not a navigable link.
            continue;
        } else if (substr($l, 0, 5) != "https" && substr($l, 0, 4) != "http") {
            // Anything else that does not start with http/https is treated as a path under the host root.
            $l = parse_url($url)["scheme"] . "://" . parse_url($url)["host"] . "/" . $l;
        }

        if (!in_array($l, $already_crawled)) {
            $already_crawled[] = $l;
            $crawling[] = $l;

            // NOTE(review): get_details() can return '', in which case json_decode()
            // yields null and the property reads below operate on null.
            $details = json_decode(get_details($l));
            echo $details->URL . " ";

            // fetchColumn() returns the first column of the first matching row, or false when there is none.
            $rows = $pdo->query("SELECT * FROM dex WHERE url_hash='" . md5($details->URL) . "'");
            $rows = $rows->fetchColumn();

            $params = array(':title' => $details->Title, ':description' => $details->Description, ':keywords' => $details->Keywords, ':url' => $details->URL, ':url_hash' => md5($details->URL));

            if ($rows > 0) {
                // A row already exists; only the word "UPDATE" is printed, no UPDATE statement is run.
                echo "UPDATE" . "\n";
            } else {

                // Rows are only inserted when the title is non-null and non-empty
                // and the description is non-null.
                if (!is_null($params[':title']) && !is_null($params[':description']) && $params[':title'] != '') {

                    $result = $pdo->prepare("INSERT INTO dex (title, description, keywords, url, url_hash) VALUES (:title, :description, :keywords, :url, :url_hash)");
                    // $result is overwritten with execute()'s boolean and never inspected.
                    $result= $result->execute($params); 


                    //if ($result) {
                    //    echo "Inserted successfully.\n";
                    //} else {
                    //    echo "Insertion failed.\n";
                    //    print_r($stmt->errorInfo());
                    //}
                }
            }


            //print_r($details)."\n";
            //echo get_details($l)."\n";
            //echo $l."\n";
        }

    }
    // Drop the first queued URL, then recurse into every URL still in the queue.
    array_shift($crawling);
    foreach ($crawling as $site) {
        follow_links($site);
    }
}




// Start the crawl at the seed URL.
follow_links($start);
//print_r($already_crawled);
?>

At first I tried different links that gave me empty values, which resulted in errors and warnings. Then I changed the links, started writing the "UPDATE"/"INSERT" if statement, and wrote the INSERT through PDO first to test it out. When I executed the file using the php command, I got the intended results in terms of how it was supposed to look in the terminal, but when I checked the table I found that nothing had been inserted. I want to insert these rows so I can use them in my search engine and make them searchable by query.

Link to comment
Share on other sites

13 hours ago, requinix said:
if (!empty($titleElements)) {

You didn't define $titleElements. Thus the $title is empty...

I tweaked the code a little,
 

/**
 * Intended to fetch $url and return its title, meta description, meta keywords
 * and the URL as a hand-assembled JSON object string.
 */
function get_details($url) {
    $options = array('http'=>array('method'=>"GET", 'headers'=>"User-Agent: howBot/0.1\n"));
    $context = stream_context_create($options);

    // Suppress warnings for HTML parsing errors
    libxml_use_internal_errors(true);

    $doc = new DOMDocument();
    // NOTE(review): loadHTML() is called here as a plain function and its result is
    // assigned over $doc; it is not invoked as a method of the DOMDocument created
    // on the previous line ($doc->loadHTML(...)).
    @$doc = loadHTML(@file_get_contents($url, false, $context));

    // Load HTML content and check for parsing errors
        // The first <title> element is dereferenced without checking that one exists.
        $title = $doc->getElementsByTagName("title");
        $title = $title->item(0)->nodeValue;
        
        $description = "";
        $keywords = "";
        // Scan every <meta> element; the last matching one wins.
        $metas = $doc->getElementsByTagName("meta");
        for ($i = 0; $i < $metas->length; $i++) {
            $meta = $metas->item($i);

            // strtolower() is applied to the literal, so the attribute is compared case-sensitively.
            if ($meta->getAttribute("name") == strtolower("description")) {
                $description = $meta->getAttribute("content");
            }
            if ($meta->getAttribute("name") == strtolower("keywords")) {
                $keywords = $meta->getAttribute("content");
            }
        }

        // JSON built by concatenation: only "\n" is stripped, nothing is escaped.
        return '{"Title": "'.str_replace("\n", "", $title).'", "Description": "'.str_replace("\n", "", $description).'", "Keywords": "'.str_replace("\n", "", $keywords).'", "URL": "'.$url.'"}';
}

and got these errors instead:

PS C:\xampp\htdocs\deepsearch> php crawler_test-1.php
PHP Fatal error:  Uncaught Error: Call to undefined function loadHTML() in C:\xampp\htdocs\deepsearch\crawler_test-1.php:21
Stack trace:
#0 C:\xampp\htdocs\deepsearch\crawler_test-1.php(81): get_details()
#1 C:\xampp\htdocs\deepsearch\crawler_test-1.php(126): follow_links()
#2 {main}
  thrown in C:\xampp\htdocs\deepsearch\crawler_test-1.php on line 21

Fatal error: Uncaught Error: Call to undefined function loadHTML() in C:\xampp\htdocs\deepsearch\crawler_test-1.php:21
Stack trace:
#0 C:\xampp\htdocs\deepsearch\crawler_test-1.php(81): get_details()
#1 C:\xampp\htdocs\deepsearch\crawler_test-1.php(126): follow_links()
#2 {main}
  thrown in C:\xampp\htdocs\deepsearch\crawler_test-1.php on line 21
PS C:\xampp\htdocs\deepsearch>

 

Link to comment
Share on other sites

13 hours ago, requinix said:
if (!empty($titleElements)) {

You didn't define $titleElements. Thus the $title is empty...

Never mind I fixed it:

<?php

// Development settings: report every error level and print errors to the output.
error_reporting(E_ALL);
ini_set('display_errors', 1);

// Seed page the crawl starts from.
$start = "http://localhost/deepsearch/test.html";

// Connect to the "deepsearch" database. The error mode is set explicitly so that a
// failing query or INSERT raises a PDOException instead of silently returning
// false (the silent mode is the default before PHP 8).
$pdo = new PDO(
    'mysql:host=127.0.0.1;dbname=deepsearch',
    'root',
    '',
    array(PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION)
);

// Crawl state shared with follow_links() through `global`:
// every URL seen so far, and the URLs still queued to be followed.
$already_crawled = array();
$crawling = array();

/**
 * Fetch a page and describe it as a JSON object string.
 *
 * The returned JSON has the keys "Title" (text of the first <title>, or ""),
 * "Description" and "Keywords" (content of the matching <meta name="...">
 * elements, or "") and "URL" (the $url argument). Newlines are removed from the
 * three text values.
 *
 * @param string $url Address (or local path) of the page to read.
 * @return string JSON object string, or '' when the page could not be fetched,
 *                parsed or encoded; a message is echoed for fetch/parse failures.
 */
function get_details($url) {
    // The http wrapper reads the option named "header" (singular); header lines end in CRLF.
    $options = array('http' => array('method' => "GET", 'header' => "User-Agent: howBot/0.1\r\n"));
    $context = stream_context_create($options);

    // Best-effort fetch: a dead link must not abort the crawl, so the warning is
    // silenced and the failure is handled through the return value instead.
    $html = @file_get_contents($url, false, $context);
    if ($html === false || trim($html) === '') {
        // loadHTML() rejects empty input, so stop before reaching the parser.
        echo "HTML parsing error: no content received from " . $url . "\n";
        return '';
    }

    // Collect libxml complaints about real-world markup internally instead of emitting warnings.
    libxml_use_internal_errors(true);

    $doc = new DOMDocument();
    if (!$doc->loadHTML($html)) {
        // libxml_get_last_error() returns false when no error was recorded.
        $error = libxml_get_last_error();
        libxml_clear_errors();
        echo "HTML parsing error: " . ($error ? $error->message : "unknown error") . "\n";
        return '';
    }
    // Drop the buffered parser errors so they do not pile up over a long crawl.
    libxml_clear_errors();

    // A DOMNodeList is an object, so empty() never reports it as empty; check its length.
    $title = "";
    $titleElements = $doc->getElementsByTagName("title");
    if ($titleElements->length > 0) {
        $title = $titleElements->item(0)->nodeValue;
    }

    $description = "";
    $keywords = "";
    foreach ($doc->getElementsByTagName("meta") as $meta) {
        // Lower-case the attribute (not the literal) so "Description"/"KEYWORDS" match too.
        $name = strtolower($meta->getAttribute("name"));
        if ($name === "description") {
            $description = $meta->getAttribute("content");
        } else if ($name === "keywords") {
            $keywords = $meta->getAttribute("content");
        }
    }

    // json_encode() escapes quotes, backslashes and control characters, which the
    // hand-built string could not do.
    $json = json_encode(
        array(
            "Title" => str_replace("\n", "", $title),
            "Description" => str_replace("\n", "", $description),
            "Keywords" => str_replace("\n", "", $keywords),
            "URL" => $url,
        ),
        JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
    );

    return $json === false ? '' : $json;
}
/**
 * Crawl one page and index the pages it links to.
 *
 * Fetches $url, converts every <a href> into an absolute URL and, for each URL
 * that has not been seen yet, records it in $already_crawled / $crawling, reads
 * its details with get_details() and inserts a row into the "dex" table unless
 * a row with the same url_hash (md5 of the URL) already exists. Finally the
 * function recurses into the URLs that are still queued.
 *
 * Uses the globals $pdo (PDO connection), $already_crawled and $crawling.
 *
 * @param string $url Absolute URL of the page whose links are followed.
 * @return void
 */
function follow_links($url)
{
    global $pdo;
    global $already_crawled;
    global $crawling;

    // The http wrapper reads the option named "header" (singular); header lines end in CRLF.
    $options = array('http' => array('method' => "GET", 'header' => "User-Agent: howBot/0.1\r\n"));
    $context = stream_context_create($options);

    // Best-effort fetch: an unreachable page simply contributes no links.
    $html = @file_get_contents($url, false, $context);

    $linklist = array();
    if ($html !== false && trim($html) !== '') {
        // Collect markup errors internally instead of emitting warnings, then
        // restore whatever setting was active before.
        $previous = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $linklist = $doc->getElementsByTagName("a");
    }

    // Split the base URL once; components missing from $url default to "".
    $base = parse_url($url);
    if (!is_array($base)) {
        $base = array();
    }
    $scheme = isset($base["scheme"]) ? $base["scheme"] : "";
    $host = isset($base["host"]) ? $base["host"] : "";
    $path = isset($base["path"]) ? $base["path"] : "";
    // Keep a non-default port, otherwise links on e.g. localhost:8080 resolve to the wrong server.
    $origin = $scheme . "://" . $host . (isset($base["port"]) ? ":" . $base["port"] : "");

    foreach ($linklist as $link) {
        $l = $link->getAttribute("href");

        if (substr($l, 0, 1) === "/" && substr($l, 0, 2) !== "//") {
            // Root-relative: "/path"
            $l = $origin . $l;
        } else if (substr($l, 0, 2) === "//") {
            // Scheme-relative: "//host/path"
            $l = $scheme . ":" . $l;
        } else if (substr($l, 0, 2) === "./") {
            // Relative to the current directory: "./page"
            $l = $origin . dirname($path) . substr($l, 1);
        } else if (substr($l, 0, 1) === "#") {
            // Fragment on the current page
            $l = $origin . $path . $l;
        } else if (substr($l, 0, 3) === "../") {
            // Parent-relative: appended after the host unchanged
            $l = $origin . "/" . $l;
        } else if (substr($l, 0, 11) === "javascript:") {
            // Not a navigable link
            continue;
        } else if (substr($l, 0, 5) !== "https" && substr($l, 0, 4) !== "http") {
            // Anything else is treated as a path under the host root
            $l = $origin . "/" . $l;
        }

        if (in_array($l, $already_crawled, true)) {
            continue;
        }
        $already_crawled[] = $l;
        $crawling[] = $l;

        // get_details() returns '' for pages it cannot read; json_decode() then
        // yields null, so skip the database step for those.
        $details = json_decode(get_details($l));
        if (!is_object($details) || !isset($details->URL)) {
            continue;
        }
        echo $details->URL . " ";

        $url_hash = md5($details->URL);
        $params = array(
            ':title' => isset($details->Title) ? $details->Title : null,
            ':description' => isset($details->Description) ? $details->Description : null,
            ':keywords' => isset($details->Keywords) ? $details->Keywords : null,
            ':url' => $details->URL,
            ':url_hash' => $url_hash,
        );

        try {
            // COUNT(*) makes the existence check independent of what the first column holds.
            $lookup = $pdo->prepare("SELECT COUNT(*) FROM dex WHERE url_hash = :url_hash");
            if ($lookup === false || !$lookup->execute(array(':url_hash' => $url_hash))) {
                echo "Lookup failed.\n";
                print_r($lookup === false ? $pdo->errorInfo() : $lookup->errorInfo());
                continue;
            }

            if ((int) $lookup->fetchColumn() > 0) {
                // Already indexed; updating existing rows is not implemented yet.
                echo "UPDATE" . "\n";
                continue;
            }

            // Only pages with a non-empty title and a (possibly empty) description are indexed.
            if (is_null($params[':title']) || is_null($params[':description']) || $params[':title'] === '') {
                continue;
            }

            // NOTE(review): '' is passed for the first column, presumably an
            // AUTO_INCREMENT id; MySQL strict mode may reject it. Consider naming
            // the target columns explicitly once the schema is confirmed.
            $insert = $pdo->prepare("INSERT INTO dex VALUES ('', :title, :description, :keywords, :url, :url_hash)");
            if ($insert === false || !$insert->execute($params)) {
                // Silent PDO error mode: report the failure instead of dropping it.
                echo "Insertion failed.\n";
                print_r($insert === false ? $pdo->errorInfo() : $insert->errorInfo());
            }
        } catch (PDOException $e) {
            // Exception PDO error mode: report and keep crawling.
            echo "Insertion failed: " . $e->getMessage() . "\n";
        }
    }

    // Drop the first queued URL, then recurse into every URL still in the queue.
    array_shift($crawling);
    foreach ($crawling as $site) {
        follow_links($site);
    }
}




// Start the crawl at the seed URL.
follow_links($start);
//print_r($already_crawled);
?>

thanks a lot @requinix, I was being an idiot and didn't notice

Link to comment
Share on other sites

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Restore formatting

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.