Jump to content

Parse large XML files!


Awilum

Recommended Posts

I need to parse large XML files, ranging in size from about 500 MB to about 1700 MB.

 

I use XMLReader

 

	
        set_time_limit(0);

$start_time = microtime(true);


include_once 'inc/Misc.php';
include_once 'inc/Database.php';

$files = array('xml/large_file.xml');



foreach($files as $file) {

	echo "\n";		
	echo 'Filename: '.basename($file)."\n";
	echo 'Filesize: '.convert(filesize($file))."\n";
	echo 'Start parsing...'."\n";
	echo "\n";

	$reader = new XMLReader();

	$reader->open($file);		


	while ($reader->read()) {
	    switch ($reader->nodeType) {
	        case (XMLREADER::ELEMENT):
	        if ($reader->localName == "element-name") {		                
	                $dom = new DomDocument();
	                $n = $dom->importNode($reader->expand(),true);
	                $dom->appendChild($n);
	                $sxe = simplexml_import_dom($n);   
	                $tess->file_big->insert($sxe);          
	                echo "Insert done! "; benchmark();		                
	        }		        
	    }		    
	}

}

 

Everything is fine in the beginning ...

The script parses the file and slowly inserts the data I want, but memory consumption grows steadily until the machine runs out of resources.

 

For example, while parsing a 400 MB file the script used up 2000 MB of RAM; at that point all resources were exhausted and the script stopped.

 

How should I deal with large files of about 500 MB to 1700 MB?

 

Would XML Parser help? If so, how would I apply it to my problem?

 

Are there any other options?

Link to comment
Share on other sites

I rewrote the script. It now frees memory wherever it can, and it uses MongoDB for data storage.

The general problem of running out of memory remains, though I don't think this should happen with XMLReader.

 

xmlreader.php

<?php

/**
 * xmlreader.php
 *
 * Streams each XML file with XMLReader, converts every <case-file> element
 * into a SimpleXMLElement and logs memory usage / elapsed time after each step.
 * The MongoDB insert is currently commented out.
 */

// Reference point read by the timing helpers in inc/Misc.php.
$start_time = microtime(true);

include 'inc/Misc.php';

logAdd('Script start');

set_time_limit(0);

// Try to enable garbage collection on 5.3+
if (function_exists('gc_enable') && !gc_enabled()) {
    gc_enable();
}

//$mongo = new Mongo();
$db = 'tess';
$collection = 'apc';


$files = array(/*'xml/apc101231-42.xml',*/
               /*'xml/apc110101.xml'*/ 'xml/apc110101.xml');

foreach ($files as $file) {
    echo "\n";
    echo 'Filename: '.basename($file)."\n";
    echo 'Start parsing...'."\n";
    echo "\n";

    $reader = new XMLReader();

    // Do not enter the read loop with a reader that failed to open.
    if (!$reader->open($file)) {
        logAdd('Cannot open '.$file);
        echo 'Cannot open '.basename($file)."\n";
        continue;
    }

    logAdd('Start parsing');

    // Deliberately no logging per node: only <case-file> elements are logged,
    // otherwise every node read would cost one write to the log file.
    $has_node = $reader->read();

    while ($has_node) {
        if ($reader->nodeType == XMLReader::ELEMENT && $reader->localName == "case-file") {

            logAdd('case-file found');

            // expand() returns false when the subtree cannot be built.
            $expanded = $reader->expand();
            if ($expanded === false) {
                logAdd('Cannot expand case-file');
                break;
            }

            // Copy the subtree into its own document so SimpleXML can wrap it.
            $dom = new DomDocument();
            $n = $dom->importNode($expanded, true);
            $dom->appendChild($n);
            $sxe = simplexml_import_dom($n);

            logAdd('case-file in $sxe');

            // Insert data!
            //$mongo->$db->$collection->insert($sxe);

            logAdd('Insert done!');

            //print_r($sxe);
            echo "Insert done! \n";

            // Now clear the memory.
            unset($expanded, $n, $dom, $sxe);

            logAdd('Clear the memory');

            // The subtree was handled above; step over it rather than
            // visiting each descendant again with read().
            $has_node = $reader->next();
        } else {
            $has_node = $reader->read();
        }
    }

    // Close the resource
    $reader->close();

    // Delete the object to free memory
    unset($reader);

    logAdd('Stop parsing');
}

// The connection only exists when the Mongo line above is enabled.
if (isset($mongo)) {
    $mongo->close();
}

 

 

inc/Misc.php

<?php


     /**
      * Convert bytes in 'kb','mb','gb','tb','pb'
      * @param integer $size Data to convert
      * @return string
      */
    function convert($size)	{
        $unit=array('b','kb','mb','gb','tb','pb');
        return @round($size/pow(1024,($i=floor(log($size,1024)))),2).' '.$unit[$i];
    }	


    /**
     * Get memory usage
     * @param boolean $render Displays the result of the function in the browser or not
     */
    function getMemoryUsage($render=true) {
        if (function_exists('memory_get_usage')) {
            $memory_usage = memory_get_usage();
        } else if (substr(PHP_OS,0,3) == 'WIN') {
            // Windows 2000 workaround
            $output = array();
            exec('pslist ' . getmypid() , $output);
            $memory_usage = trim(substr($output[8],38,10));
        } else {
            $memory_usage = '';
        }
        if($render) {
            printf('Memory usage: '.convert($memory_usage));
        } else {
            return $memory_usage;
        }
    }
    
    
    /**
     * Get elapsed time
     * @global integer $start_time Start time value
     * @param boolean $render Displays the result of the function in the browser or not
     */
    function getElapsedTime($render=true) {
        global $start_time;
        $result_time = microtime(true) - $start_time;
        if($render) printf("Elapsed time %.3f seconds",$result_time); else return sprintf("%.3f", $result_time);
    }


    /**
     * Benchmark
     */
    function benchmark() {          
        getMemoryUsage(); echo " - "; getElapsedTime(); echo "\n";
    }


/**
 * Log add
 *
 * Appends one line to log.txt in the form
 * "<message> - <memory usage> - <elapsed seconds>".
 *
 * @param string $message Text to put at the start of the log line
 */
function logAdd($message) {
	$memory  = convert(getMemoryUsage(false));
	$elapsed = getElapsedTime(false);
	$line    = $message.' - '.$memory." - ".$elapsed."\n";

	file_put_contents('log.txt', $line, FILE_APPEND);
}

?>

 

 

Log

Script start - 338.05 kb - 0.003

Start parsing - 348.74 kb - 0.044

case-file found - 348.77 kb - 0.055

case-file in $sxe - 349.78 kb - 0.068

Insert done! - 422.3 kb - 0.078

Clear the memory - 349.45 kb - 0.082

case-file found - 349.44 kb - 0.302

case-file in $sxe - 350.19 kb - 0.308

Insert done! - 385.02 kb - 0.314

Clear the memory - 349.45 kb - 0.319

case-file found - 349.44 kb - 0.385

case-file in $sxe - 350.19 kb - 0.390

Insert done! - 368.86 kb - 0.395

Clear the memory - 349.45 kb - 0.401

case-file found - 349.44 kb - 0.437

case-file in $sxe - 350.19 kb - 0.441

Insert done! - 366.63 kb - 0.447

Clear the memory - 349.45 kb - 0.452

case-file found - 349.44 kb - 0.489

case-file in $sxe - 350.19 kb - 0.494

Insert done! - 367.61 kb - 0.499

Clear the memory - 349.45 kb - 0.502

case-file found - 349.44 kb - 0.539

case-file in $sxe - 350.19 kb - 0.543

Insert done! - 369.81 kb - 0.550

Clear the memory - 349.45 kb - 0.553

case-file found - 349.44 kb - 0.593

case-file in $sxe - 350.19 kb - 0.596

Insert done! - 366.64 kb - 0.602

Clear the memory - 349.45 kb - 0.607

case-file found - 349.44 kb - 0.641

case-file in $sxe - 350.19 kb - 0.646

Insert done! - 369.09 kb - 0.650

Clear the memory - 349.45 kb - 0.655

 

I do not see any problem in the log, but the process running my script still leaks memory.

 

 

 

Link to comment
Share on other sites

You can use RegEx to do that.

 

Can you show me a small sample XML file with the data you want to extract?

 

I have a similar problem - I want to create an index from a large xml file (about 2 Mb).

 

I have worked out how to use str_replace and preg_replace to achieve this, but the problem is that the XML file always takes a very long time to load, no matter what method I use.

 

So I was wondering (I am a beginner) if there is a way to do this on a website only intermittently - say once a day - and store the result, since the index doesn't need to be "in real time"

 

So "document.xml" (2Mb - changes sometimes daily, sometimes weekly) -> somehow… ->  "a_daily_index.html" (80 Kb) -> accessed by php include to "my_website_index_page.php" hundreds of times a day.

Link to comment
Share on other sites

Yes, I tried to write this script with XML Parser, but it is difficult to build the kind of parser I need: I have to recreate all the keys, including nested elements that share the same name.

 

 

<?php

     // Remove the execution time limit for this run.
     set_time_limit(0);
    
     /*$mongo = new Mongo();
     $tess = $mongo->selectDB('tess');*/
    
     // Record currently being assembled; the SaxClass handlers reach it via `global $case`.
     $case = array();
            
     // NOTE(review): at file scope this `global` looks redundant - confirm before removing.
     global $case;
    
    
     class SaxClass {
        
         private $serial_number = false;
         private $registration_number = false;       
         private $mark_identification = false;   
         private $status_date = false;           
         private $filing_date = false;       
         private $published_for_opposition_date = false;       
         private $state = false;               
        
         private $p = false;       

                
                
         /**
         * Callback for the start of each element
         */
         function startElement($parser_object, $elementname, $attribute) {
             global $case;
        
             if ($elementname == "serial-number")    $this->serial_number = true; else $this->serial_number = false;       
             if ($elementname == "registration-number") $this->registration_number = true; else $this->registration_number = false;
             if ($elementname == "mark-identification") $this->mark_identification = true; else $this->mark_identification = false;           
             if ($elementname == "status-date") $this->status_date = true; else $this->status_date = false;           
             if ($elementname == "filing-date") $this->filing_date = true; else $this->filing_date = false;                       
             if ($elementname == "published-for-opposition-date") $this->published_for_opposition_date = true; else $this->published_for_opposition_date = false;                       


             if ($elementname == "nationality") {
                 $this->p = true;
             } else {
                 $this->p = false;               
             }                   
            

             if($this->p == true && $elementname == 'state') {
                 $this->state = true;
             } else {
                 $this->state = false;
             }
                
                
             if ($elementname == "case-file") {           
                 echo 'start' . "\n";   
                 $case = array();
             }
         }
        
        
         /**
         * Callback for the end of each element
         */
         function endElement($parser_object, $elementname) {
             global $tess, $case;
        
             if ($elementname == "case-file") {                           
                 print_r($case);
                 //$tess->case_file_spc->insert($case);
                 echo 'end' . "\n";
             }
         }
        
        
         /**
         * Callback for the content within an element
         */
         function contentHandler($parser_object, $data) {
             global $tess, $case;   

             if ($this->serial_number) if($this->cl($data) !== '') $case['serial-number'] = $this->cl($data);
             if ($this->registration_number) if($this->cl($data) !== '') $case['registration-number'] = $this->cl($data);                   
             if ($this->mark_identification) if($this->cl($data) !== '') $case['mark-identification'] = $this->cl($data);   
             if ($this->status_date) if($this->cl($data) !== '') $case['status-date'] = $this->cl($data);   
             if ($this->filing_date) if($this->cl($data) !== '') $case['filing-date'] = $this->cl($data);   
             if ($this->published_for_opposition_date) if($this->cl($data) !== '') $case['published-for-opposition-date'] = $this->cl($data);   
             if ($this->state) if($this->cl($data) !== '') $case['state'] = $this->cl($data);   
            
         }
        
        
         /**
         * Clean text
         */
         function cl($data) {
             return preg_replace('/^\\s+|\\s+$/m', '', $data);
         }
        
     }
    
        
    
     /**
     * Function to start the parsing once all values are set and
     * the file has been opened
     */
     function doParse($parser_object) {

         if(!($fp = fopen("small.xml", "r")));
        
         //loop through data
         while ($data = fread($fp, 4096)) {
             //parse the fragment
             xml_parse($parser_object, $data, feof($fp));
         }
     }
    
    
    
     $SaxObject = new SaxClass();
     $parser_object = xml_parser_create();
    
     xml_set_object($parser_object, $SaxObject);
    
     //Don't alter the case of the data
     xml_parser_set_option($parser_object, XML_OPTION_CASE_FOLDING, false);
    
     xml_set_element_handler($parser_object, "startElement", "endElement");
     xml_set_character_data_handler($parser_object, "contentHandler");
    
     doParse($parser_object);

 

This condition does not work:

 

  if ($elementname == "nationality") {
                 $this->p = true;
             } else {
                 $this->p = false;               
             }                   
            

             if($this->p == true && $elementname == 'state') {
                 $this->state = true;
             } else {
                 $this->state = false;
             }

 

<nationality>

<state>DE</state>

</nationality>

 

start

Array

(

[serial-number] => 77694333

[registration-number] => 0000000

[filing-date] => 20090318

[status-date] => 20090323

[mark-identification] => GC

[published-for-opposition-date] => 20091222

)

end

Link to comment
Share on other sites

This thread is more than a year old. Please don't revive it unless you have something important to add.

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Restore formatting

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.