mikebyrne Posted August 11, 2009 Share Posted August 11, 2009 I'm putting together a piece of code to pull the text from a pdf and place it into a textfile. It seems to work fine and passes all error checks but it just produces a blank text file???? My code is <?php // Function : pdf2txt() // Arguments : $filename - Filename of the PDF you want to extract // Description : Reads a pdf file, extracts data streams, and manages // their translation to plain text - returning the plain // text at the end // Authors : Jonathan Beckett, 2005-05-02 // : Sven Schuberth, 2007-03-29 error_reporting(E_ALL); ini_set('display_errors', TRUE); $path = 'C:\\Users\\Mike\\Desktop\\'; file_put_contents($path . 'txtfile.txt', pdf2txt($path . 'Athy Register.pdf')); if (file_exists($path . 'txtfile.txt')) { echo "File txtfile.txt has been created succesfully and contains <br><br><pre>"; echo file_get_contents($path . 'txtfile.txt'); }else { echo "File txtfile.txt was not created due to an issue."; } function pdf2txt($filename){ $data = getFileData($filename); $s=strpos($data,"%")+1; $version=substr($data,$s,strpos($data,"%",$s)-1); if(substr_count($version,"PDF-1.2")==0) return handleV3($data); else return handleV2($data); } // handles the verson 1.2 function handleV2($data){ // grab objects and then grab their contents (chunks) $a_obj = getDataArray($data,"obj","endobj"); $j = -1; foreach($a_obj as $obj){ $a_filter = getDataArray($obj,"<<",">>"); if ((is_array($a_filter)) && (count($a_filter) > 0)) { $j++; $a_chunks[$j]["filter"] = $a_filter[0]; $a_data = getDataArray($obj,"stream\r\n","endstream"); if ((is_array($a_data)) && (count($a_data) > 0)) { $a_chunks[$j]["data"] = substr($a_data[0], strlen("stream\r\n"), strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream")); } } } // decode the chunks foreach($a_chunks as $chunk){ // look at each chunk and decide how to decode it - by looking at the contents of the filter $a_filter = split("/",$chunk["filter"]); if (isset($chunk['data'])) { // look at the filter to find out which encoding has been used if (substr($chunk["filter"],"FlateDecode")!==false){ $data =@ gzuncompress($chunk["data"]); if (trim($data)!=""){ $result_data .= ps2txt($data); } else { //$result_data .= "x"; } } } } if (isset($result_data)) { return $result_data; } else { return false; } } //handles versions >1.2 function handleV3($data){ // grab objects and then grab their contents (chunks) $a_obj = getDataArray($data,"obj","endobj"); $result_data=""; foreach($a_obj as $obj){ //check if it a string if(substr_count($obj,"/GS1")>0){ //the strings are between ( and ) preg_match_all("|\((.*?)\)|",$obj,$field,PREG_SET_ORDER); if(is_array($field)) foreach($field as $data) $result_data.=$data[1]; } } return $result_data; } function ps2txt($ps_data){ $result = ""; $a_data = getDataArray($ps_data,"[","]"); if (is_array($a_data)){ foreach ($a_data as $ps_text){ $a_text = getDataArray($ps_text,"(",")"); if (is_array($a_text)){ foreach ($a_text as $text){ $result .= substr($text,1,strlen($text)-2); } } } } else { // the data may just be in raw format (outside of [] tags) $a_text = getDataArray($ps_data,"(",")"); if (is_array($a_text)){ foreach ($a_text as $text){ $result .= substr($text,1,strlen($text)-2); } } } return $result; } function getFileData($filename){ $handle = fopen($filename,"rb"); $data = fread($handle, filesize($filename)); fclose($handle); return $data; } function getDataArray($data,$start_word,$end_word){ $start = 0; $end = 0; $a_result = array(); while ($start!==false && $end!==false){ $start = strpos($data,$start_word,$end); if ($start!==false){ $end = strpos($data,$end_word,$start); if ($end!==false){ // data is between start and end $a_result[] = substr($data,$start,$end-$start+strlen($end_word)); } } } return $a_result; } ?> Quote Link to comment Share on other sites More sharing options...
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.