php_brute Posted May 20, 2023 Share Posted May 20, 2023 (edited) DESCRIPTION: Error gets shown here: $dom->loadXML($xml); //LINE: 44 RESULT: I get echoed: **( ! ) Warning: DOMDocument::loadXML(): Start tag expected, '<' not found in Entity, line: 6 in C:\wamp64... on line 44 Call Stack # Time Memory Function Location 1 0.0034 362440 {main}( ) ...\crawler_Test.php:0 2 14.0745 365456 loadXML( $source = class SimpleXMLElement { public $sitemap = [0 => class SimpleXMLElement { ... }, 1 => class SimpleXMLElement { ... }, 2 => class SimpleXMLElement { ... }, 3 => class SimpleXMLElement { ... }] } ) 46 73 SiteMaps Crawled: --- Array ( ) Html Pages Crawled: --- Array ( ) Array ( ) Array ( ) Array ( ) 50 Array ( ) Array ( ) Array ( ) ** CODE <?php ini_set('display_errors',1); ini_set('display_startup_errors',1); error_reporting(E_ALL); //START OF SCRIPT FLOW. //Preparing Crawler & Session: Initialising Variables. //Preparing $ARRAYS For Step 1: To Deal with Xml Links meant for Crawlers only. //SiteMaps Details Scraped from SiteMaps or Xml Files. $sitemaps = []; //This will list extracted further Xml SiteMap links (.xml) found on Sitemaps (.xml). $sitemaps_last_mods = []; //This will list dates of SiteMap pages last modified - found on Sitemaps. $sitemaps_change_freqs = []; //his will list SiteMap dates of html pages frequencies of page updates - found on Sitemaps. $sitemaps_priorities = []; //This will list SiteMap pages priorities - found on Sitemaps. //Webpage Details Scraped from SiteMaps or Xml Files. $html_page_urls = []; //This will list extracted html links Urls (.html, .htm, .php) - found on Sitemaps (.xml). $html_page_last_mods = []; //This will list dates of html pages last modified - found on Sitemap. $html_page_change_freqs = []; //his will list dates of html pages frequencies of page updates - found on Sitemaps. $html_page_priorities = []; //This will list html pages priorities - found on Sitemaps. //Preparing $ARRAYS For Step 2: To Deal with html pages meant for Human Visitors only. //Data Scraped from Html Files. Not Xml SiteMap Files. $html_page_meta_names = []; //This will list crawled pages Meta Tag Names - found on html pages. $html_page_meta_descriptions = []; //This will list crawled pages Meta Tag Descriptions - found on html pages. $html_page_titles = []; //This will list crawled pages Titles - found on html pages. // ----- //Step 1: Initiate Session - Feed Xml SiteMap Url. Crawing Starting Point. //Crawl Session Starting Page/Initial Xml Sitemap. (NOTE: Has to be .xml SItemap). $initial_url = "https://www.rocktherankings.com/sitemap_index.xml"; //Has more xml files. //$xmls = file_get_contents($initial_url); //Should I stick to this line or below line ? //Parse the sitemap content to object //$xml = simplexml_load_string($xmls); //Should I stick to this line or above line ? $xml = simplexml_load_string(file_get_contents($initial_url)); //Code from Dani: https://www.daniweb.com/programming/web-development/threads/540168/what-to-lookout-for-to-prevent-crawler-traps $dom = new DOMDocument(); $dom->loadXML($xml); //LINE: 44 echo __LINE__; echo '<br>'; //LINE: 46 extract_links($xml); echo __LINE__; echo '<br>'; //LINE: 50 foreach($sitemaps AS $sitemap) { echo __LINE__; echo '<br>'; extract_links($sitemap); //Extract Links on page. } foreach($html_page_urls AS $html_page_url) { echo __LINE__; echo '<br>'; extract_links($html_page_url); //Extract Links on page. } scrape_page_data(); //Scrape Page Title & Meta Tags. //END OF SCRIPT FLOW. //DUNCTIONS BEYOND THIS POINT. //Links Extractor. function extract_links() { echo __LINE__; echo '<br>'; //LINE: 73 GLOBAL $dom; //Trigger following IF/ELSEs on each Crawled Page to check for link types. Whether Links lead to more SiteMaps (.xml) or webpages (.html, .htm, .php, etc.). if ($dom->nodeName === 'sitemapindex') //Current Xml SiteMap Page lists more Xml SiteMaps. Lists links to Xml links. Not lists links to html links. { echo __LINE__; echo '<br>'; //parse the index // retrieve properties from the sitemap object foreach ($xml->sitemapindex as $urlElement) //Extracts html file urls. { // get properties $sitemaps[] = $sitemap_url = $urlElement->loc; $sitemaps_last_mods[] = $last_mod = $urlElement->lastmod; $sitemaps_change_freqs[] = $change_freq = $urlElement->changefreq; $sitemaps_priorities[] = $priority = $urlElement->priority; // print out the properties echo 'url: '. $sitemap_url . '<br>'; echo 'lastmod: '. $last_mod . '<br>'; echo 'changefreq: '. $change_freq . '<br>'; echo 'priority: '. $priority . '<br>'; echo '<br>---<br>'; } } else if ($dom->nodeName === 'urlset') //Current Xml SiteMap Page lists no more Xml SiteMap links. Lists only html links. { echo __LINE__; echo '<br>'; //parse url set // retrieve properties from the sitemap object foreach ($xml->urlset as $urlElement) //Extracts Sitemap Urls. { // get properties $html_page_urls[] = $html_page_url = $urlElement->loc; $html_page_last_mods[] = $last_mod = $urlElement->lastmod; $html_page_change_freqs[] = $change_freq = $urlElement->changefreq; $html_page_priorities[] = $priority = $urlElement->priority; // print out the properties echo 'url: '. $html_page_url . '<br>'; echo 'lastmod: '. $last_mod . '<br>'; echo 'changefreq: '. $change_freq . '<br>'; echo 'priority: '. $priority . '<br>'; echo '<br>---<br>'; } } GLOBAL $sitemaps; GLOBAL $sitemaps_last_mods; GLOBAL $sitemaps_change_freqs; GLOBAL $sitemaps_priorities; GLOBAL $html_page_urls; GLOBAL $html_page_last_mods; GLOBAL $html_page_change_freqs; GLOBAL $html_page_priorities; echo 'SiteMaps Crawled: ---'; echo '<br><br>'; if(array_count_values($sitemaps)>0) { print_r($sitemaps); echo '<br>'; } elseif(array_count_values($sitemaps_last_mods)>0) { print_r($sitemaps_last_mods); echo '<br>'; } elseif(array_count_values($sitemaps_change_freqs)>0) { print_r($sitemaps_change_freqs); echo '<br>'; } elseif(array_count_values($sitemaps_priorities)>0) { print_r($sitemaps_priorities); echo '<br><br>'; } echo 'Html Pages Crawled: ---'; echo '<br><br>'; if(array_count_values($html_page_urls)>0) { print_r($html_page_urls); echo '<br>'; } if(array_count_values($html_page_last_mods)>0) { print_r($html_page_last_mods); echo '<br>'; } if(array_count_values($html_page_change_freqs)>0) { print_r($html_page_change_freqs); echo '<br>'; } if(array_count_values($html_page_priorities)>0) { print_r($html_page_priorities); echo '<br>'; } } //Meta Data & Title Extractor. function scrape_page_data() { GLOBAL $html_page_urls; if(array_count_values($html_page_urls)>0) { foreach($html_page_urls AS $url) { // https://www.php.net/manual/en/function.file-get-contents $html = file_get_contents($url); //https://www.php.net/manual/en/domdocument.construct.php $doc = new DOMDocument(); // https://www.php.net/manual/en/function.libxml-use-internal-errors.php libxml_use_internal_errors(true); // https://www.php.net/manual/en/domdocument.loadhtml.php $doc->loadHTML($html, LIBXML_COMPACT|LIBXML_NOERROR|LIBXML_NOWARNING); // https://www.php.net/manual/en/function.libxml-clear-errors.php libxml_clear_errors(); // https://www.php.net/manual/en/domdocument.getelementsbytagname.php $meta_tags = $doc->getElementsByTagName('meta'); // https://www.php.net/manual/en/domnodelist.item.php if ($meta_tags->length > 0) { // https://www.php.net/manual/en/class.domnodelist.php foreach ($meta_tags as $tag) { // https://www.php.net/manual/en/domnodelist.item.php echo 'Meta Name: ' .$meta_name = $tag->getAttribute('name'); echo '<br>'; echo 'Meta Content: ' .$meta_content = $tag->getAttribute('content'); echo '<br>'; $html_page_meta_names[] = $meta_name; $html_page_meta_descriptions[] = $meta_content; } } //EXAMPLE 1: Extract Title $title_tag = $doc->getElementsByTagName('title'); if ($title_tag->length>0) { echo 'Title: ' .$title = $title_tag[0]->textContent; echo '<br>'; $html_page_titles[] = $title; } //EXAMPLE 2: Extract Title $title_tag = $doc->getElementsByTagName('title'); for ($i = 0; $i < $title_tag->length; $i++) { echo 'Title: ' .$title = $title_tag->item($i)->nodeValue . "\n"; $html_page_titles[] = $title; } } } } if(array_count_values($html_page_meta_names)>0) { print_r($html_page_meta_names); echo '<br>'; } if(array_count_values($html_page_meta_descriptions)>0) { print_r($html_page_meta_descriptions); echo '<br>'; } if(array_count_values($html_page_titles)>0) { print_r($html_page_titles); echo '<br>'; } //END OF FUNCTIONS. ?> I can’t afford to get errors like this as not all websites will be code error free. I want to suppress such error that gets shown due to coding errors on webpages crawled on third party websites. How to do this ? NOTE: I should only get error if my own code (crawler is coded in error) has errors. I do not know how to suppress the error. I would like to know if there are any error on my coding that could cause issues while crawling the web or later on at any point. Regards! Edited May 20, 2023 by php_brute Link to comment Share on other sites More sharing options...
Recommended Posts