mikebyrne Posted August 11, 2009 Share Posted August 11, 2009 I'm putting together a piece of code to pull the text from a pdf and place it into a textfile. It seems to work fine and passes all error checks but it just produces a blank text file???? My code is <?php // Function : pdf2txt() // Arguments : $filename - Filename of the PDF you want to extract // Description : Reads a pdf file, extracts data streams, and manages // their translation to plain text - returning the plain // text at the end // Authors : Jonathan Beckett, 2005-05-02 // : Sven Schuberth, 2007-03-29 error_reporting(E_ALL); ini_set('display_errors', TRUE); $path = 'C:\\Users\\Mike\\Desktop\\'; file_put_contents($path . 'txtfile.txt', pdf2txt($path . 'Athy Register.pdf')); if (file_exists($path . 'txtfile.txt')) { echo "File txtfile.txt has been created succesfully and contains <br><br><pre>"; echo file_get_contents($path . 'txtfile.txt'); }else { echo "File txtfile.txt was not created due to an issue."; } function pdf2txt($filename){ $data = getFileData($filename); $s=strpos($data,"%")+1; $version=substr($data,$s,strpos($data,"%",$s)-1); if(substr_count($version,"PDF-1.2")==0) return handleV3($data); else return handleV2($data); } // handles the verson 1.2 function handleV2($data){ // grab objects and then grab their contents (chunks) $a_obj = getDataArray($data,"obj","endobj"); $j = -1; foreach($a_obj as $obj){ $a_filter = getDataArray($obj,"<<",">>"); if ((is_array($a_filter)) && (count($a_filter) > 0)) { $j++; $a_chunks[$j]["filter"] = $a_filter[0]; $a_data = getDataArray($obj,"stream\r\n","endstream"); if ((is_array($a_data)) && (count($a_data) > 0)) { $a_chunks[$j]["data"] = substr($a_data[0], strlen("stream\r\n"), strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream")); } } } // decode the chunks foreach($a_chunks as $chunk){ // look at each chunk and decide how to decode it - by looking at the contents of the filter $a_filter = split("/",$chunk["filter"]); if (isset($chunk['data'])) { // look at the filter to find out which encoding has been used if (substr($chunk["filter"],"FlateDecode")!==false){ $data =@ gzuncompress($chunk["data"]); if (trim($data)!=""){ $result_data .= ps2txt($data); } else { //$result_data .= "x"; } } } } if (isset($result_data)) { return $result_data; } else { return false; } } //handles versions >1.2 function handleV3($data){ // grab objects and then grab their contents (chunks) $a_obj = getDataArray($data,"obj","endobj"); $result_data=""; foreach($a_obj as $obj){ //check if it a string if(substr_count($obj,"/GS1")>0){ //the strings are between ( and ) preg_match_all("|\((.*?)\)|",$obj,$field,PREG_SET_ORDER); if(is_array($field)) foreach($field as $data) $result_data.=$data[1]; } } return $result_data; } function ps2txt($ps_data){ $result = ""; $a_data = getDataArray($ps_data,"[","]"); if (is_array($a_data)){ foreach ($a_data as $ps_text){ $a_text = getDataArray($ps_text,"(",")"); if (is_array($a_text)){ foreach ($a_text as $text){ $result .= substr($text,1,strlen($text)-2); } } } } else { // the data may just be in raw format (outside of [] tags) $a_text = getDataArray($ps_data,"(",")"); if (is_array($a_text)){ foreach ($a_text as $text){ $result .= substr($text,1,strlen($text)-2); } } } return $result; } function getFileData($filename){ $handle = fopen($filename,"rb"); $data = fread($handle, filesize($filename)); fclose($handle); return $data; } function getDataArray($data,$start_word,$end_word){ $start = 0; $end = 0; $a_result = array(); while ($start!==false && $end!==false){ $start = strpos($data,$start_word,$end); if ($start!==false){ $end = strpos($data,$end_word,$start); if ($end!==false){ // data is between start and end $a_result[] = substr($data,$start,$end-$start+strlen($end_word)); } } } return $a_result; } ?> Link to comment https://forums.phpfreaks.com/topic/169802-code-not-pulling-info-from-pdf/ Share on other sites More sharing options...
Recommended Posts
Archived
This topic is now archived and is closed to further replies.