Awilum Posted May 20, 2011 Share Posted May 20, 2011 I need to parse large XML files ranging in size from ~500 to ~1700 MB. I use XMLReader: set_time_limit(0); $start_time = microtime(true); include_once 'inc/Misc.php'; include_once 'inc/Database.php'; $files = array('xml/large_file.xml'); foreach($files as $file) { echo "\n"; echo 'Filename: '.basename($file)."\n"; echo 'Filesize: '.convert(filesize($file))."\n"; echo 'Start parsing...'."\n"; echo "\n"; $reader = new XMLReader(); $reader->open($file); while ($reader->read()) { switch ($reader->nodeType) { case (XMLREADER::ELEMENT): if ($reader->localName == "element-name") { $dom = new DomDocument(); $n = $dom->importNode($reader->expand(),true); $dom->appendChild($n); $sxe = simplexml_import_dom($n); $tess->file_big->insert($sxe); echo "Insert done! "; benchmark(); } } } } Everything is fine in the beginning: the file is parsed and my data is slowly inserted, but memory consumption grows gradually until the resources run out. For example, with a 400 MB file, parsing used up 2000 MB of RAM, all the resources ran out, and the script stopped. How should I deal with large files (~500 to ~1700 MB)? Would XML Parser help? If yes, how do I apply it to my problem? Are there any other options? Quote Link to comment Share on other sites More sharing options...
Awilum Posted May 24, 2011 Author Share Posted May 24, 2011 I rewrote the script. Frees memory as I can. For data storage use MongoDB. General problem that runs the memory remains. But this should not be with XMLReader i think. xmlreader.php <?php $start_time = microtime(true); include 'inc/Misc.php'; logAdd('Script start'); set_time_limit(0); // Try to enable garbage collection on 5.3+ if (function_exists('gc_enable') && !gc_enabled()) { gc_enable(); } //$mongo = new Mongo(); $db = 'tess'; $collection = 'apc'; $files = array(/*'xml/apc101231-42.xml',*/ /*'xml/apc110101.xml'*/ 'xml/apc110101.xml'); foreach($files as $file) { echo "\n"; echo 'Filename: '.basename($file)."\n"; echo 'Start parsing...'."\n"; echo "\n"; $reader = new XMLReader(); $reader->open($file); logAdd('Srart parsing'); while ($reader->read()) { switch ($reader->nodeType) { case (XMLREADER::ELEMENT): if ($reader->localName == "case-file") { logAdd('case-file found'); $dom = new DomDocument(); $n = $dom->importNode($reader->expand(),true); $dom->appendChild($n); $sxe = simplexml_import_dom($n); logAdd('case-file in $sxe'); // Insert data! //$mongo->$db->$collection->insert($sxe); logAdd('Insert done!'); //print_r($sxe); echo "Insert done! \n"; // Now clear the memory. unset($n, $dom, $sxe); logAdd('Clear the memory'); } break; } logAdd('case-file in $sxe'); } // Close the resource $reader->close(); // Delete the object to free memory unset($reader); logAdd('Stop parsing'); } $mongo->close(); inc/Misc.php <?php /** * Convert bytes in 'kb','mb','gb','tb','pb' * @param integer $size Data to convert * @return string */ function convert($size) { $unit=array('b','kb','mb','gb','tb','pb'); return @round($size/pow(1024,($i=floor(log($size,1024)))),2).' 
'.$unit[$i]; } /** * Get memory usage * @param boolean $render Displays the result of the function in the browser or not */ function getMemoryUsage($render=true) { if (function_exists('memory_get_usage')) { $memory_usage = memory_get_usage(); } else if (substr(PHP_OS,0,3) == 'WIN') { // Windows 2000 workaround $output = array(); exec('pslist ' . getmypid() , $output); $memory_usage = trim(substr($output[8],38,10)); } else { $memory_usage = ''; } if($render) { printf('Memory usage: '.convert($memory_usage)); } else { return $memory_usage; } } /** * Get elapsed time * @global integer $start_time Start time value * @param boolean $render Displays the result of the function in the browser or not */ function getElapsedTime($render=true) { global $start_time; $result_time = microtime(true) - $start_time; if($render) printf("Elapsed time %.3f seconds",$result_time); else return sprintf("%.3f", $result_time); } /** * Benchmark */ function benchmark() { getMemoryUsage(); echo " - "; getElapsedTime(); echo "\n"; } /** * Log add */ function logAdd($message) { file_put_contents('log.txt',$message.' - '.convert(getMemoryUsage(false))." - ".getElapsedTime(false)."\n", FILE_APPEND); } ?> Log Script start - 338.05 kb - 0.003 Start parsing - 348.74 kb - 0.044 case-file found - 348.77 kb - 0.055 case-file in $sxe - 349.78 kb - 0.068 Insert done! - 422.3 kb - 0.078 Clear the memory - 349.45 kb - 0.082 case-file found - 349.44 kb - 0.302 case-file in $sxe - 350.19 kb - 0.308 Insert done! - 385.02 kb - 0.314 Clear the memory - 349.45 kb - 0.319 case-file found - 349.44 kb - 0.385 case-file in $sxe - 350.19 kb - 0.390 Insert done! - 368.86 kb - 0.395 Clear the memory - 349.45 kb - 0.401 case-file found - 349.44 kb - 0.437 case-file in $sxe - 350.19 kb - 0.441 Insert done! - 366.63 kb - 0.447 Clear the memory - 349.45 kb - 0.452 case-file found - 349.44 kb - 0.489 case-file in $sxe - 350.19 kb - 0.494 Insert done! 
- 367.61 kb - 0.499 Clear the memory - 349.45 kb - 0.502 case-file found - 349.44 kb - 0.539 case-file in $sxe - 350.19 kb - 0.543 Insert done! - 369.81 kb - 0.550 Clear the memory - 349.45 kb - 0.553 case-file found - 349.44 kb - 0.593 case-file in $sxe - 350.19 kb - 0.596 Insert done! - 366.64 kb - 0.602 Clear the memory - 349.45 kb - 0.607 case-file found - 349.44 kb - 0.641 case-file in $sxe - 350.19 kb - 0.646 Insert done! - 369.09 kb - 0.650 Clear the memory - 349.45 kb - 0.655 I do not see any problems in the log, but the process running my script still leaks memory. Quote Link to comment Share on other sites More sharing options...
xyph Posted May 24, 2011 Share Posted May 24, 2011 I'm assuming XMLReader parses the entire file at once. You may have to read the large files line-by-line using fopen()/fgets()/fclose() and parse the XML manually. Quote Link to comment Share on other sites More sharing options...
Awilum Posted May 24, 2011 Author Share Posted May 24, 2011 xyph, isn't that exactly what XMLReader does, or do you think not? >fopen()/fgets()/fclose() I cannot think of a line-by-line solution other than collecting everything between <case-file> and </case-file>. Quote Link to comment Share on other sites More sharing options...
xyph Posted May 24, 2011 Share Posted May 24, 2011 You can use RegEx to do that. Can you show me a small sample XML file with the data you want to extract? Quote Link to comment Share on other sites More sharing options...
Awilum Posted May 25, 2011 Author Share Posted May 25, 2011 http://commondatastorage.googleapis.com/trademarks/applications/2011/apc110101.zip http://www.google.com/googlebooks/uspto-trademarks-recent-applications.html Quote Link to comment Share on other sites More sharing options...
mac_gabe Posted May 25, 2011 Share Posted May 25, 2011 You can use RegEx to do that. Can you show me a small sample XML file with the data you want to extract? I have a similar problem - I want to create an index from a large xml file (about 2 Mb). I have worked out how to use str_replace and preg_replace to achieve this, but the problem is the xml file will always take a very long time to load, no matter what method I use. So I was wondering (I am a beginner) if there is a way to do this on a website only intermittently - say once a day - and store the result, since the index doesn't need to be "in real time". So "document.xml" (2Mb - changes sometimes daily, sometimes weekly) -> somehow… -> "a_daily_index.html" (80 Kb) -> accessed by php include to "my_website_index_page.php" hundreds of times a day. Quote Link to comment Share on other sites More sharing options...
gizmola Posted May 25, 2011 Share Posted May 25, 2011 I'm not seeing what the problem is here. The xmlreader and sax libraries both allow you to parse large files in pieces, and the code you have looks like it works fine and handles memory ok. Quote Link to comment Share on other sites More sharing options...
Awilum Posted May 25, 2011 Author Share Posted May 25, 2011 gizmola, yes, I know, but why does the php.exe process leak so much memory when I parse the data... I don't know why. Quote Link to comment Share on other sites More sharing options...
gizmola Posted May 25, 2011 Share Posted May 25, 2011 I didn't see from your log file that it was leaking memory. What is showing you that you have a memory leak? Quote Link to comment Share on other sites More sharing options...
Awilum Posted May 25, 2011 Author Share Posted May 25, 2011 Quote Link to comment Share on other sites More sharing options...
gizmola Posted May 25, 2011 Share Posted May 25, 2011 Well, that is unfortunate. My guess is that it is a windows specific issue, possibly calling libraries that are not thread safe under windows. You could try the old expat library to see if that works better under windows. http://php.net/manual/en/book.xml.php Quote Link to comment Share on other sites More sharing options...
Awilum Posted May 25, 2011 Author Share Posted May 25, 2011 Yes, I tried to make this script with XML Parser, but difficult to recreate such a parser that I need. I need to recreate all the keys, plus embedded design elements with the same name. <?php set_time_limit(0); /*$mongo = new Mongo(); $tess = $mongo->selectDB('tess');*/ $case = array(); global $case; class SaxClass { private $serial_number = false; private $registration_number = false; private $mark_identification = false; private $status_date = false; private $filing_date = false; private $published_for_opposition_date = false; private $state = false; private $p = false; /** * Callback for the start of each element */ function startElement($parser_object, $elementname, $attribute) { global $case; if ($elementname == "serial-number") $this->serial_number = true; else $this->serial_number = false; if ($elementname == "registration-number") $this->registration_number = true; else $this->registration_number = false; if ($elementname == "mark-identification") $this->mark_identification = true; else $this->mark_identification = false; if ($elementname == "status-date") $this->status_date = true; else $this->status_date = false; if ($elementname == "filing-date") $this->filing_date = true; else $this->filing_date = false; if ($elementname == "published-for-opposition-date") $this->published_for_opposition_date = true; else $this->published_for_opposition_date = false; if ($elementname == "nationality") { $this->p = true; } else { $this->p = false; } if($this->p == true && $elementname == 'state') { $this->state = true; } else { $this->state = false; } if ($elementname == "case-file") { echo 'start' . "\n"; $case = array(); } } /** * Callback for the end of each element */ function endElement($parser_object, $elementname) { global $tess, $case; if ($elementname == "case-file") { print_r($case); //$tess->case_file_spc->insert($case); echo 'end' . 
"\n"; } } /** * Callback for the content within an element */ function contentHandler($parser_object, $data) { global $tess, $case; if ($this->serial_number) if($this->cl($data) !== '') $case['serial-number'] = $this->cl($data); if ($this->registration_number) if($this->cl($data) !== '') $case['registration-number'] = $this->cl($data); if ($this->mark_identification) if($this->cl($data) !== '') $case['mark-identification'] = $this->cl($data); if ($this->status_date) if($this->cl($data) !== '') $case['status-date'] = $this->cl($data); if ($this->filing_date) if($this->cl($data) !== '') $case['filing-date'] = $this->cl($data); if ($this->published_for_opposition_date) if($this->cl($data) !== '') $case['published-for-opposition-date'] = $this->cl($data); if ($this->state) if($this->cl($data) !== '') $case['state'] = $this->cl($data); } /** * Clean text */ function cl($data) { return preg_replace('/^\\s+|\\s+$/m', '', $data); } } /** * Function to start the parsing once all values are set and * the file has been opened */ function doParse($parser_object) { if(!($fp = fopen("small.xml", "r"))); //loop through data while ($data = fread($fp, 4096)) { //parse the fragment xml_parse($parser_object, $data, feof($fp)); } } $SaxObject = new SaxClass(); $parser_object = xml_parser_create(); xml_set_object($parser_object, $SaxObject); //Don't alter the case of the data xml_parser_set_option($parser_object, XML_OPTION_CASE_FOLDING, false); xml_set_element_handler($parser_object, "startElement", "endElement"); xml_set_character_data_handler($parser_object, "contentHandler"); doParse($parser_object); does not work, this condition if ($elementname == "nationality") { $this->p = true; } else { $this->p = false; } if($this->p == true && $elementname == 'state') { $this->state = true; } else { $this->state = false; } <nationality> <state>DE</state> </nationality> start Array ( [serial-number] => 77694333 [registration-number] => 0000000 [filing-date] => 20090318 [status-date] => 
20090323 [mark-identification] => GC [published-for-opposition-date] => 20091222 ) end Quote Link to comment Share on other sites More sharing options...
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.