Jump to content

mikebyrne

Members
  • Posts

    780
  • Joined

  • Last visited

Posts posted by mikebyrne

  1. Thanks

     

    I'm just getting a mountain of nonsense ie

     

    %PDF-1.2 %âãÏÓ 1 0 obj << /Type /Catalog /Pages 2 0 R /PageMode /UseNone /ViewerPreferences << /FitWindow true /PageLayout /SinglePage /NonFullScreenPageMode /UseNone >> >> endobj 2 0 obj << /Type /Pages /Kids [ 8 0 R 30 0 R 40 0 R 44 0 R 48 0 R 52 0 R 56 0 R 60 0 R 64 0 R 68 0 R 72 0 R 76 0 R 80 0 R 84 0 R 88 0 R 92 0 R 96 0 R 100 0 R 104 0 R 108 0 R 112 0 R 116 0 R 120 0 R 124 0 R 128 0 R 132 0 R 136 0 R 140 0 R 144 0 R 148 0 R 152 0 R 156 0 R 160 0 R 164 0 R 168 0 R 172 0 R 176 0 R 180 0 R 184 0 R 188 0 R 192 0 R 196 0 R 200 0 R 204 0 R 208 0 R 212 0 R 216 0 R 220 0 R 224 0 R 228 0 R 232 0 R 236 0 R 240 0 R 244 0 R 248 0 R 252 0 R 256 0 R 260 0 R 264 0 R 268 0 R 272 0 R 276 0 R 280 0 R 284 0 R 288 0 R 292 0 R 296 0 R 300 0 R 304 0 R 308 0 R 312 0 R 316 0 R 320 0 R 324 0 R 328 0 R 332 0 R 336 0 R 340 0 R 344 0 R 348 0 R 352 0 R 368 0 R 372 0 R 376 0 R 380 0 R ] /Count 85 /MediaBox 3 0 R /CropBox 4 0 R >> endobj 3 0 obj [ 0 0 595 841 ] endobj 4 0 obj [ 0 0 595 841 ] endobj 5 0 obj << /Length 1701 /Filter [ /FlateDecode ] >> stream xœÅ™ÛrÛ6†ïù¸«3ã*�€@{åSÓC’¦–ÓÜø¦)‹ .I9£Çë›'@‰±"³S{ÆÚßZ°X`I¤�êßÖˆóàõeò”ÆÉõ›s×Cu±0ø5øÊš‹º¿Ë‹�Cð% ,L„äBaȬȂeðxþJÂPd¯„ÌŠþJ#€¥X9®[Fî*ã—ëàSP<-ˆðP7°²œê;G+F"‹‡iäTÖ*ÊÕe™stÂv¯.Ý4+‚¸Yí©ãàü&@&‚™þ·þQÁq+nòàä:yHë&©@¹WY7eUƒWàæ¯àêFÃy) ¼Fõ¼Px­ób5ð†!Ä=/¢ï/X•Uœ€UUæêëf ~J¶�C(@SªÙöÿ‹à΀Bª9C!5 ÊY+²VPÁ”pnV91ì†2Άù”i.Ê¢nÒf“ñö‡]TAõrÂ"ìPèPÕÜÖ͉cQ íQ „|ˆz‘6[ðZÁnŠf&ƒz8L§SŽÍ:¾T·1 ú[šÝË*™ t1Gó¨z«l ㋢îÁ²Ü4뉀cÎúµiDpLqp'Ž x¢ŽRå±Gù¡Ì²´x�—jƒ¨Ò¸Ù¹Jv3Q|EZtK‡BÏ®u³âHRŒ9íçôjS•‰,ÀYe©Ì“¢y&©Ôeð€ãÁs».Œ›Ç’‡‘èçX„l2©ÀÛ2–Y» +ë¬JäT¦9ö6·,¼SØ:¾ŒÞ[Çcú³f½Â³‹¢Å³«¢ƒ5+¡ÇK–…Ÿfãe!ëfÏe—B°K¯VÙ£ß9Z1ËìòKÏø¼9ó-&ÅB÷¨¨ÄÔ·ØÖmTžP¢Ê„× Ö‰¬Ìçf…?J)!¥n£%ŠÜ·‹,Ù¤eq{Rß¾Ú Ê€BsÑ1µ*¸Çµb”£p‰k‚r^nkð^6°Œ×e™©(]—ªÜ€hŠ\å#ÖV!Hzr+f!UF_'ÇSä(±:9b¹sCÎÈáäá¹ÊG¬­¢˜÷äVÌB¾/…'ÉɹÊG¬­Â¤OK'f!ÇœNN§È1d¨Ã³JÚ%¨³gáV?‡s³ý܆'÷A`¬KNgÏ@­ŠÊoHÍhZãä>§aÔ女ç€&ø²’O@kœÜçtB•Ø´µç€F~E²”©ªýþL‹XÕ€ßÕàç²~L™‚½µJO“û˜Vp»dtöÌ\ÝÍ{Ìq©nß­Óx­>ßËr%Ÿ;pLîS:AQ—†Îž™2x(òÄIc`rŸÒ Œ»töÈ8‡"O1&÷)­ˆT!Ü";{äHpr(òÄÙb`rŸÒ FºüsöÈÌ/¹¿†L÷Õ¹7õk¨—³±!E†SUó#a¼FU

  2. I've put the print_r after each function but im still getting a blank screen

     

    <?php
    // Function    : pdf2txt()
    // Arguments   : $filename - Filename of the PDF you want to extract
    // Description : Reads a pdf file, extracts data streams, and manages
    //               their translation to plain text - returning the plain
    //               text at the end
    // Authors      : Jonathan Beckett, 2005-05-02
    //                            : Sven Schuberth, 2007-03-29
    
    $pdftext = pdf2txt('C:\Users\Mike\Desktop\Athy Register.pdf');
    
    
    function pdf2txt($filename){    
    
        $data = getFileData($filename);
       
        $s=strpos($data,"%")+1;
       
        $version=substr($data,$s,strpos($data,"%",$s)-1);
        if(substr_count($version,"PDF-1.2")==0)
            return handleV3($data);
        else
            return handleV2($data);
    
       
    }
    // handles the verson 1.2
    function handleV2($data){
           
        // grab objects and then grab their contents (chunks)
        $a_obj = getDataArray($data,"obj","endobj");
       
         $j = -1;
        foreach($a_obj as $obj){
           
            $a_filter = getDataArray($obj,"<<",">>");
       
            if ((is_array($a_filter)) && (count($a_filter) > 0)) {
                $j++;
                $a_chunks[$j]["filter"] = $a_filter[0];
    
                $a_data = getDataArray($obj,"stream\r\n","endstream");
                if ((is_array($a_data)) && (count($a_data) > 0)) {
                    $a_chunks[$j]["data"] = substr($a_data[0],
    strlen("stream\r\n"),
    strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream"));
                }
            }
        }
    
        // decode the chunks
        foreach($a_chunks as $chunk){
    
            // look at each chunk and decide how to decode it - by looking at the contents of the filter
            $a_filter = split("/",$chunk["filter"]);
           
            if (isset($chunk['data'])) {
                // look at the filter to find out which encoding has been used           
                if (substr($chunk["filter"],"FlateDecode")!==false){
                    $data =@ gzuncompress($chunk["data"]);
                    if (trim($data)!=""){
                        $result_data .= ps2txt($data);
                    } else {
                   
                        //$result_data .= "x";
                    }
                }
            }
        }
       
        if (isset($result_data)) {
        return $result_data;
    } else {
        return false;
    }
    }
    
    //handles versions >1.2
    function handleV3($data){
        // grab objects and then grab their contents (chunks)
        $a_obj = getDataArray($data,"obj","endobj");
        $result_data="";
        foreach($a_obj as $obj){
            //check if it a string
            if(substr_count($obj,"/GS1")>0){
                //the strings are between ( and )
                preg_match_all("|\((.*?)\)|",$obj,$field,PREG_SET_ORDER);
                if(is_array($field))
                    foreach($field as $data)
                        $result_data.=$data[1];
            }
        }
        return $result_data;
        print_r ($a_result);
        
    }
    
    function ps2txt($ps_data){
        $result = "";
        $a_data = getDataArray($ps_data,"[","]");
        if (is_array($a_data)){
            foreach ($a_data as $ps_text){
                $a_text = getDataArray($ps_text,"(",")");
                if (is_array($a_text)){
                    foreach ($a_text as $text){
                        $result .= substr($text,1,strlen($text)-2);
                    }
                }
            }
        } else {
            // the data may just be in raw format (outside of [] tags)
            $a_text = getDataArray($ps_data,"(",")");
            if (is_array($a_text)){
                foreach ($a_text as $text){
                    $result .= substr($text,1,strlen($text)-2);
                }
            }
        }
        return $result;
        print_r ($result);
    }
    
    function getFileData($filename){
        $handle = fopen($filename,"rb");
        $data = fread($handle, filesize($filename));
        fclose($handle);
        return $data;
        print_r ($data);
    }
    
    function getDataArray($data,$start_word,$end_word){
    
        $start = 0;
        $end = 0;
        $a_result = array();
       
        while ($start!==false && $end!==false){
            $start = strpos($data,$start_word,$end);
            if ($start!==false){
                $end = strpos($data,$end_word,$start);
                if ($end!==false){
                    // data is between start and end
                    $a_result[] = substr($data,$start,$end-$start+strlen($end_word));
                }
            }
        }
        return $a_result;
        print_r ($a_result);
    }
    error_reporting(E_ALL);
    ini_set('display_errors', TRUE);
    
    file_put_contents('txtfile.txt', pdf2txt('Athy Register.pdf'));
    ?>
    

  3. I've replaced that line but im still getting two errors 

     

    Notice: Undefined offset: 0 in C:\xampp\htdocs\pdf2txt_test.php on line 44

     

    Notice: Undefined offset: 0 in C:\xampp\htdocs\pdf2txt_test.php on line 46

     

    Its got rid of one error

     

    Line 44 is

     

    $a_chunks[$j]["data"] = substr($a_data[0],
    

     

    Line 46 is

    strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream"));
    

  4. Im working on a pdf to textfile program and I'm getting the following errora

     

     

    Notice: Undefined offset: 0 in C:\xampp\htdocs\pdf2txt_test.php on line 40

     

    Notice: Undefined offset: 0 in C:\xampp\htdocs\pdf2txt_test.php on line 44

     

    Notice: Undefined offset: 0 in C:\xampp\htdocs\pdf2txt_test.php on line 46

     

    The lines are in the function

     

    function handleV2($data){
           
        // grab objects and then grab their contents (chunks)
        $a_obj = getDataArray($data,"obj","endobj");
       
         $j = -1;
        foreach($a_obj as $obj){
           
            $a_filter = getDataArray($obj,"<<",">>");
       
            if (is_array($a_filter)){
                $j++;
                $a_chunks[$j]["filter"] = $a_filter[0];
    
                $a_data = getDataArray($obj,"stream\r\n","endstream");
                if (is_array($a_data)){
                    $a_chunks[$j]["data"] = substr($a_data[0],
    strlen("stream\r\n"),
    strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream"));
                }
    

     

    My complete code is:

     

    <?php
    // Function    : pdf2txt()
    // Arguments   : $filename - Filename of the PDF you want to extract
    // Description : Reads a pdf file, extracts data streams, and manages
    //               their translation to plain text - returning the plain
    //               text at the end
    // Authors      : Jonathan Beckett, 2005-05-02
    //                            : Sven Schuberth, 2007-03-29
    
    $pdftext = pdf2txt('C:\Users\Mike\Desktop\Athy Register.pdf');
    
    
    function pdf2txt($filename){    
    
        $data = getFileData($filename);
       
        $s=strpos($data,"%")+1;
       
        $version=substr($data,$s,strpos($data,"%",$s)-1);
        if(substr_count($version,"PDF-1.2")==0)
            return handleV3($data);
        else
            return handleV2($data);
    
       
    }
    // handles the verson 1.2
    function handleV2($data){
           
        // grab objects and then grab their contents (chunks)
        $a_obj = getDataArray($data,"obj","endobj");
       
         $j = -1;
        foreach($a_obj as $obj){
           
            $a_filter = getDataArray($obj,"<<",">>");
       
            if (is_array($a_filter)){
                $j++;
                $a_chunks[$j]["filter"] = $a_filter[0];
    
                $a_data = getDataArray($obj,"stream\r\n","endstream");
                if (is_array($a_data)){
                    $a_chunks[$j]["data"] = substr($a_data[0],
    strlen("stream\r\n"),
    strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream"));
                }
            }
        }
    
        // decode the chunks
        foreach($a_chunks as $chunk){
    
            // look at each chunk and decide how to decode it - by looking at the contents of the filter
            $a_filter = split("/",$chunk["filter"]);
           
            if ($chunk["data"]!=""){
                // look at the filter to find out which encoding has been used           
                if (substr($chunk["filter"],"FlateDecode")!==false){
                    $data =@ gzuncompress($chunk["data"]);
                    if (trim($data)!=""){
                        $result_data .= ps2txt($data);
                    } else {
                   
                        //$result_data .= "x";
                    }
                }
            }
        }
       
        return $result_data;
    }
    
    //handles versions >1.2
    function handleV3($data){
        // grab objects and then grab their contents (chunks)
        $a_obj = getDataArray($data,"obj","endobj");
        $result_data="";
        foreach($a_obj as $obj){
            //check if it a string
            if(substr_count($obj,"/GS1")>0){
                //the strings are between ( and )
                preg_match_all("|\((.*?)\)|",$obj,$field,PREG_SET_ORDER);
                if(is_array($field))
                    foreach($field as $data)
                        $result_data.=$data[1];
            }
        }
        return $result_data;
        
    }
    
    function ps2txt($ps_data){
        $result = "";
        $a_data = getDataArray($ps_data,"[","]");
        if (is_array($a_data)){
            foreach ($a_data as $ps_text){
                $a_text = getDataArray($ps_text,"(",")");
                if (is_array($a_text)){
                    foreach ($a_text as $text){
                        $result .= substr($text,1,strlen($text)-2);
                    }
                }
            }
        } else {
            // the data may just be in raw format (outside of [] tags)
            $a_text = getDataArray($ps_data,"(",")");
            if (is_array($a_text)){
                foreach ($a_text as $text){
                    $result .= substr($text,1,strlen($text)-2);
                }
            }
        }
        return $result;
    }
    
    function getFileData($filename){
        $handle = fopen($filename,"rb");
        $data = fread($handle, filesize($filename));
        fclose($handle);
        return $data;
    }
    
    function getDataArray($data,$start_word,$end_word){
    
        $start = 0;
        $end = 0;
        $a_result = array();
       
        while ($start!==false && $end!==false){
            $start = strpos($data,$start_word,$end);
            if ($start!==false){
                $end = strpos($data,$end_word,$start);
                if ($end!==false){
                    // data is between start and end
                    $a_result[] = substr($data,$start,$end-$start+strlen($end_word));
                }
            }
        }
        return $a_result;
    }
    error_reporting(E_ALL);
    ini_set('display_errors', TRUE);
    
    file_put_contents('txtfile.txt', pdf2txt('Athy Register.pdf'));
    ?>
    

     

    Any help on this would be great

  5. I'm noe getting the error

     

     

    Notice: Undefined offset: 0 in C:\xampp\htdocs\pdf2txt_test.php on line 40

     

    Notice: Undefined offset: 0 in C:\xampp\htdocs\pdf2txt_test.php on line 44

     

    Notice: Undefined offset: 0 in C:\xampp\htdocs\pdf2txt_test.php on line 46

     

    The lines are in the function

     

    function handleV2($data){
           
        // grab objects and then grab their contents (chunks)
        $a_obj = getDataArray($data,"obj","endobj");
       
         $j = -1;
        foreach($a_obj as $obj){
           
            $a_filter = getDataArray($obj,"<<",">>");
       
            if (is_array($a_filter)){
                $j++;
                $a_chunks[$j]["filter"] = $a_filter[0];
    
                $a_data = getDataArray($obj,"stream\r\n","endstream");
                if (is_array($a_data)){
                    $a_chunks[$j]["data"] = substr($a_data[0],
    strlen("stream\r\n"),
    strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream"));
                }
    

     

  6. Thanks that seemed to fix it

     

    Im now getting the error

     

    Notice: Undefined variable: a_result in C:\xampp\htdocs\pdf2txt_test.php on line 140

     

    line 140 is in the function

     

    function getDataArray($data,$start_word,$end_word){
    
        $start = 0;
        $end = 0;
        unset($a_result);
       
        while ($start!==false && $end!==false){
            $start = strpos($data,$start_word,$end);
            if ($start!==false){
                $end = strpos($data,$end_word,$start);
                if ($end!==false){
                    // data is between start and end
                    $a_result[] = substr($data,$start,$end-$start+strlen($end_word));
                }
            }
        }
        return $a_result;
    

     

     

     

  7. I'm trying to modify a piece of code to allow me to take the contents of a pdf document and send the output to a textfile

     

    My code is

     

    <?php
    // Function    : pdf2txt()
    // Arguments   : $filename - Filename of the PDF you want to extract
    // Description : Reads a pdf file, extracts data streams, and manages
    //               their translation to plain text - returning the plain
    //               text at the end
    // Authors      : Jonathan Beckett, 2005-05-02
    //                            : Sven Schuberth, 2007-03-29
    
    function pdf2txt($filename){    
    
        $pdftext = pdf2txt('C:\Users\Mike\Desktop\Athy Database\Athy Register.pdf');
    
        $data = getFileData($filename);
       
        $s=strpos($data,"%")+1;
       
        $version=substr($data,$s,strpos($data,"%",$s)-1);
        if(substr_count($version,"PDF-1.2")==0)
            return handleV3($data);
        else
            return handleV2($data);
    
       
    }
    // handles the verson 1.2
    function handleV2($data){
           
        // grab objects and then grab their contents (chunks)
        $a_obj = getDataArray($data,"obj","endobj");
       
        foreach($a_obj as $obj){
           
            $a_filter = getDataArray($obj,"<<",">>");
       
            if (is_array($a_filter)){
                $j++;
                $a_chunks[$j]["filter"] = $a_filter[0];
    
                $a_data = getDataArray($obj,"stream\r\n","endstream");
                if (is_array($a_data)){
                    $a_chunks[$j]["data"] = substr($a_data[0],
    strlen("stream\r\n"),
    strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream"));
                }
            }
        }
    
        // decode the chunks
        foreach($a_chunks as $chunk){
    
            // look at each chunk and decide how to decode it - by looking at the contents of the filter
            $a_filter = split("/",$chunk["filter"]);
           
            if ($chunk["data"]!=""){
                // look at the filter to find out which encoding has been used           
                if (substr($chunk["filter"],"FlateDecode")!==false){
                    $data =@ gzuncompress($chunk["data"]);
                    if (trim($data)!=""){
                        $result_data .= ps2txt($data);
                    } else {
                   
                        //$result_data .= "x";
                    }
                }
            }
        }
       
        return $result_data;
    }
    
    //handles versions >1.2
    function handleV3($data){
        // grab objects and then grab their contents (chunks)
        $a_obj = getDataArray($data,"obj","endobj");
        $result_data="";
        foreach($a_obj as $obj){
            //check if it a string
            if(substr_count($obj,"/GS1")>0){
                //the strings are between ( and )
                preg_match_all("|\((.*?)\)|",$obj,$field,PREG_SET_ORDER);
                if(is_array($field))
                    foreach($field as $data)
                        $result_data.=$data[1];
            }
        }
        return $result_data;
        
    }
    
    function ps2txt($ps_data){
        $result = "";
        $a_data = getDataArray($ps_data,"[","]");
        if (is_array($a_data)){
            foreach ($a_data as $ps_text){
                $a_text = getDataArray($ps_text,"(",")");
                if (is_array($a_text)){
                    foreach ($a_text as $text){
                        $result .= substr($text,1,strlen($text)-2);
                    }
                }
            }
        } else {
            // the data may just be in raw format (outside of [] tags)
            $a_text = getDataArray($ps_data,"(",")");
            if (is_array($a_text)){
                foreach ($a_text as $text){
                    $result .= substr($text,1,strlen($text)-2);
                }
            }
        }
        return $result;
    }
    
    function getFileData($filename){
        $handle = fopen($filename,"rb");
        $data = fread($handle, filesize($filename));
        fclose($handle);
        return $data;
    }
    
    function getDataArray($data,$start_word,$end_word){
    
        $start = 0;
        $end = 0;
        unset($a_result);
       
        while ($start!==false && $end!==false){
            $start = strpos($data,$start_word,$end);
            if ($start!==false){
                $end = strpos($data,$end_word,$start);
                if ($end!==false){
                    // data is between start and end
                    $a_result[] = substr($data,$start,$end-$start+strlen($end_word));
                }
            }
        }
        return $a_result;
    }
    error_reporting(E_ALL);
    ini_set('display_errors', TRUE);
    
    file_put_contents('txtfile.txt', pdf2txt('Athy Register.pdf'));
    ?>
    

     

    When I run the code it jst seems to freeze, any idea why??

  8. I've used php for quite a whhile now but this think is driving me mental!

     

    I didn't remove any php tags and  the output is in the browser

     

    The code looks like this

     

    <?php
    // Function    : pdf2txt()
    // Arguments   : $filename - Filename of the PDF you want to extract
    // Description : Reads a pdf file, extracts data streams, and manages
    //               their translation to plain text - returning the plain
    //               text at the end
    // Authors      : Jonathan Beckett, 2005-05-02
    //                            : Sven Schuberth, 2007-03-29
    
    function pdf2txt($filename){    
    
        $pdftext = pdf2txt('C:\Users\Mike\Desktop\Athy Database\Athy Register.pdf');
    
        $data = getFileData($filename);
       
        $s=strpos($data,"%")+1;
       
        $version=substr($data,$s,strpos($data,"%",$s)-1);
        if(substr_count($version,"PDF-1.2")==0)
            return handleV3($data);
        else
            return handleV2($data);
    
       
    }
    // handles the verson 1.2
    function handleV2($data){
           
        // grab objects and then grab their contents (chunks)
        $a_obj = getDataArray($data,"obj","endobj");
       
        foreach($a_obj as $obj){
           
            $a_filter = getDataArray($obj,"<<",">>");
       
            if (is_array($a_filter)){
                $j++;
                $a_chunks[$j]["filter"] = $a_filter[0];
    
                $a_data = getDataArray($obj,"stream\r\n","endstream");
                if (is_array($a_data)){
                    $a_chunks[$j]["data"] = substr($a_data[0],
    strlen("stream\r\n"),
    strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream"));
                }
            }
        }
    
        // decode the chunks
        foreach($a_chunks as $chunk){
    
            // look at each chunk and decide how to decode it - by looking at the contents of the filter
            $a_filter = split("/",$chunk["filter"]);
           
            if ($chunk["data"]!=""){
                // look at the filter to find out which encoding has been used           
                if (substr($chunk["filter"],"FlateDecode")!==false){
                    $data =@ gzuncompress($chunk["data"]);
                    if (trim($data)!=""){
                        $result_data .= ps2txt($data);
                    } else {
                   
                        //$result_data .= "x";
                    }
                }
            }
        }
       
        return $result_data;
    }
    
    //handles versions >1.2
    function handleV3($data){
        // grab objects and then grab their contents (chunks)
        $a_obj = getDataArray($data,"obj","endobj");
        $result_data="";
        foreach($a_obj as $obj){
            //check if it a string
            if(substr_count($obj,"/GS1")>0){
                //the strings are between ( and )
                preg_match_all("|\((.*?)\)|",$obj,$field,PREG_SET_ORDER);
                if(is_array($field))
                    foreach($field as $data)
                        $result_data.=$data[1];
            }
        }
        return $result_data;
        
    }
    
    function ps2txt($ps_data){
        $result = "";
        $a_data = getDataArray($ps_data,"[","]");
        if (is_array($a_data)){
            foreach ($a_data as $ps_text){
                $a_text = getDataArray($ps_text,"(",")");
                if (is_array($a_text)){
                    foreach ($a_text as $text){
                        $result .= substr($text,1,strlen($text)-2);
                    }
                }
            }
        } else {
            // the data may just be in raw format (outside of [] tags)
            $a_text = getDataArray($ps_data,"(",")");
            if (is_array($a_text)){
                foreach ($a_text as $text){
                    $result .= substr($text,1,strlen($text)-2);
                }
            }
        }
        return $result;
    }
    
    function getFileData($filename){
        $handle = fopen($filename,"rb");
        $data = fread($handle, filesize($filename));
        fclose($handle);
        return $data;
    }
    
    function getDataArray($data,$start_word,$end_word){
    
        $start = 0;
        $end = 0;
        unset($a_result);
       
        while ($start!==false && $end!==false){
            $start = strpos($data,$start_word,$end);
            if ($start!==false){
                $end = strpos($data,$end_word,$start);
                if ($end!==false){
                    // data is between start and end
                    $a_result[] = substr($data,$start,$end-$start+strlen($end_word));
                }
            }
        }
        return $a_result;
    }
    error_reporting(E_ALL);
    ini_set('display_errors', TRUE);
    
    file_put_contents('txtfile.txt', pdf2txt('Athy Register.pdf'));
    ?>
    

     

     

  9. Im getting the following output

     

    >"); if (is_array($a_filter)){ $j++; $a_chunks[$j]["filter"] = $a_filter[0]; $a_data = getDataArray($obj,"stream\r\n","endstream"); if (is_array($a_data)){ $a_chunks[$j]["data"] = substr($a_data[0], strlen("stream\r\n"), strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream")); } } } // decode the chunks foreach($a_chunks as $chunk){ // look at each chunk and decide how to decode it - by looking at the contents of the filter $a_filter = split("/",$chunk["filter"]); if ($chunk["data"]!=""){ // look at the filter to find out which encoding has been used if (substr($chunk["filter"],"FlateDecode")!==false){ $data =@ gzuncompress($chunk["data"]); if (trim($data)!=""){ $result_data .= ps2txt($data); } else { //$result_data .= "x"; } } } } return $result_data; } //handles versions >1.2 function handleV3($data){ // grab objects and then grab their contents (chunks) $a_obj = getDataArray($data,"obj","endobj"); $result_data=""; foreach($a_obj as $obj){ //check if it a string if(substr_count($obj,"/GS1")>0){ //the strings are between ( and ) preg_match_all("|\((.*?)\)|",$obj,$field,PREG_SET_ORDER); if(is_array($field)) foreach($field as $data) $result_data.=$data[1]; } } return $result_data; } function ps2txt($ps_data){ $result = ""; $a_data = getDataArray($ps_data,"[","]"); if (is_array($a_data)){ foreach ($a_data as $ps_text){ $a_text = getDataArray($ps_text,"(",")"); if (is_array($a_text)){ foreach ($a_text as $text){ $result .= substr($text,1,strlen($text)-2); } } } } else { // the data may just be in raw format (outside of [] tags) $a_text = getDataArray($ps_data,"(",")"); if (is_array($a_text)){ foreach ($a_text as $text){ $result .= substr($text,1,strlen($text)-2); } } } return $result; } function getFileData($filename){ $handle = fopen($filename,"rb"); $data = fread($handle, filesize($filename)); fclose($handle); return $data; } function getDataArray($data,$start_word,$end_word){ $start = 0; $end = 0; unset($a_result); while ($start!==false && $end!==false){ $start = strpos($data,$start_word,$end); if ($start!==false){ $end = strpos($data,$end_word,$start); if ($end!==false){ // data is between start and end $a_result[] = substr($data,$start,$end-$start+strlen($end_word)); } } } return $a_result; } error_reporting(E_ALL); ini_set('display_errors', TRUE); file_put_contents('txtfile.txt', pdf2txt('Athy Register.pdf')); ?>

  10. I run the code to take text from a pdf and place it into a text file but I get a blank screen with no errors.

     

    The textfile is not created on the either??

     

    Any ideas why its not working?

     

    <?php
    // Function    : pdf2txt()
    // Arguments   : $filename - Filename of the PDF you want to extract
    // Description : Reads a pdf file, extracts data streams, and manages
    //               their translation to plain text - returning the plain
    //               text at the end
    // Authors      : Jonathan Beckett, 2005-05-02
    //                            : Sven Schuberth, 2007-03-29
    
    function pdf2txt($filename){    
    
        $pdftext = pdf2txt('C:\Users\Mike\Desktop\Athy Database\Athy Register.pdf');
    
        $data = getFileData($filename);
       
        $s=strpos($data,"%")+1;
       
        $version=substr($data,$s,strpos($data,"%",$s)-1);
        if(substr_count($version,"PDF-1.2")==0)
            return handleV3($data);
        else
            return handleV2($data);
    
       
    }
    // handles the verson 1.2
    function handleV2($data){
           
        // grab objects and then grab their contents (chunks)
        $a_obj = getDataArray($data,"obj","endobj");
       
        foreach($a_obj as $obj){
           
            $a_filter = getDataArray($obj,"<<",">>");
       
            if (is_array($a_filter)){
                $j++;
                $a_chunks[$j]["filter"] = $a_filter[0];
    
                $a_data = getDataArray($obj,"stream\r\n","endstream");
                if (is_array($a_data)){
                    $a_chunks[$j]["data"] = substr($a_data[0],
    strlen("stream\r\n"),
    strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream"));
                }
            }
        }
    
        // decode the chunks
        foreach($a_chunks as $chunk){
    
            // look at each chunk and decide how to decode it - by looking at the contents of the filter
            $a_filter = split("/",$chunk["filter"]);
           
            if ($chunk["data"]!=""){
                // look at the filter to find out which encoding has been used           
                if (substr($chunk["filter"],"FlateDecode")!==false){
                    $data =@ gzuncompress($chunk["data"]);
                    if (trim($data)!=""){
                        $result_data .= ps2txt($data);
                    } else {
                   
                        //$result_data .= "x";
                    }
                }
            }
        }
       
        return $result_data;
    }
    
    //handles versions >1.2
    function handleV3($data){
        // grab objects and then grab their contents (chunks)
        $a_obj = getDataArray($data,"obj","endobj");
        $result_data="";
        foreach($a_obj as $obj){
            //check if it a string
            if(substr_count($obj,"/GS1")>0){
                //the strings are between ( and )
                preg_match_all("|\((.*?)\)|",$obj,$field,PREG_SET_ORDER);
                if(is_array($field))
                    foreach($field as $data)
                        $result_data.=$data[1];
            }
        }
        return $result_data;
        
    }
    
    function ps2txt($ps_data){
        $result = "";
        $a_data = getDataArray($ps_data,"[","]");
        if (is_array($a_data)){
            foreach ($a_data as $ps_text){
                $a_text = getDataArray($ps_text,"(",")");
                if (is_array($a_text)){
                    foreach ($a_text as $text){
                        $result .= substr($text,1,strlen($text)-2);
                    }
                }
            }
        } else {
            // the data may just be in raw format (outside of [] tags)
            $a_text = getDataArray($ps_data,"(",")");
            if (is_array($a_text)){
                foreach ($a_text as $text){
                    $result .= substr($text,1,strlen($text)-2);
                }
            }
        }
        return $result;
    }
    
    function getFileData($filename){
        $handle = fopen($filename,"rb");
        $data = fread($handle, filesize($filename));
        fclose($handle);
        return $data;
    }
    
    function getDataArray($data,$start_word,$end_word){
    
        $start = 0;
        $end = 0;
        unset($a_result);
       
        while ($start!==false && $end!==false){
            $start = strpos($data,$start_word,$end);
            if ($start!==false){
                $end = strpos($data,$end_word,$start);
                if ($end!==false){
                    // data is between start and end
                    $a_result[] = substr($data,$start,$end-$start+strlen($end_word));
                }
            }
        }
        return $a_result;
    }
    error_reporting(E_ALL);
    ini_set('display_errors', TRUE);
    
    // modify the paths
    file_put_contents('C:\Users\Mike\Desktop\txtfile.txt', pdf2txt('C:\Users\Mike\Desktop\Athy Database\Athy Register.pdf'));
    ?>
    

  11. I run the test file and I get the blank screen with no errors but the textfile is not created on the desktop??

     

    My code looks like this:

     

    <?php
    // Function    : pdf2txt()
    // Arguments   : $filename - Filename of the PDF you want to extract
    // Description : Reads a pdf file, extracts data streams, and manages
    //               their translation to plain text - returning the plain
    //               text at the end
    // Authors      : Jonathan Beckett, 2005-05-02
    //                            : Sven Schuberth, 2007-03-29
    
    function pdf2txt($filename){    
    
        $pdftext = pdf2txt('C:\Users\Mike\Desktop\Athy Database\Athy Register.pdf');
    
        $data = getFileData($filename);
       
        $s=strpos($data,"%")+1;
       
        $version=substr($data,$s,strpos($data,"%",$s)-1);
        if(substr_count($version,"PDF-1.2")==0)
            return handleV3($data);
        else
            return handleV2($data);
    
       
    }
    // handles the verson 1.2
    function handleV2($data){
           
        // grab objects and then grab their contents (chunks)
        $a_obj = getDataArray($data,"obj","endobj");
       
        foreach($a_obj as $obj){
           
            $a_filter = getDataArray($obj,"<<",">>");
       
            if (is_array($a_filter)){
                $j++;
                $a_chunks[$j]["filter"] = $a_filter[0];
    
                $a_data = getDataArray($obj,"stream\r\n","endstream");
                if (is_array($a_data)){
                    $a_chunks[$j]["data"] = substr($a_data[0],
    strlen("stream\r\n"),
    strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream"));
                }
            }
        }
    
        // decode the chunks
        foreach($a_chunks as $chunk){
    
            // look at each chunk and decide how to decode it - by looking at the contents of the filter
            $a_filter = split("/",$chunk["filter"]);
           
            if ($chunk["data"]!=""){
                // look at the filter to find out which encoding has been used           
                if (substr($chunk["filter"],"FlateDecode")!==false){
                    $data =@ gzuncompress($chunk["data"]);
                    if (trim($data)!=""){
                        $result_data .= ps2txt($data);
                    } else {
                   
                        //$result_data .= "x";
                    }
                }
            }
        }
       
        return $result_data;
    }
    
    //handles versions >1.2
    function handleV3($data){
        // grab objects and then grab their contents (chunks)
        $a_obj = getDataArray($data,"obj","endobj");
        $result_data="";
        foreach($a_obj as $obj){
            //check if it a string
            if(substr_count($obj,"/GS1")>0){
                //the strings are between ( and )
                preg_match_all("|\((.*?)\)|",$obj,$field,PREG_SET_ORDER);
                if(is_array($field))
                    foreach($field as $data)
                        $result_data.=$data[1];
            }
        }
        return $result_data;
        
    }
    
    function ps2txt($ps_data){
        $result = "";
        $a_data = getDataArray($ps_data,"[","]");
        if (is_array($a_data)){
            foreach ($a_data as $ps_text){
                $a_text = getDataArray($ps_text,"(",")");
                if (is_array($a_text)){
                    foreach ($a_text as $text){
                        $result .= substr($text,1,strlen($text)-2);
                    }
                }
            }
        } else {
            // the data may just be in raw format (outside of [] tags)
            $a_text = getDataArray($ps_data,"(",")");
            if (is_array($a_text)){
                foreach ($a_text as $text){
                    $result .= substr($text,1,strlen($text)-2);
                }
            }
        }
        return $result;
    }
    
    function getFileData($filename){
        $handle = fopen($filename,"rb");
        $data = fread($handle, filesize($filename));
        fclose($handle);
        return $data;
    }
    
    function getDataArray($data,$start_word,$end_word){
    
        $start = 0;
        $end = 0;
        unset($a_result);
       
        while ($start!==false && $end!==false){
            $start = strpos($data,$start_word,$end);
            if ($start!==false){
                $end = strpos($data,$end_word,$start);
                if ($end!==false){
                    // data is between start and end
                    $a_result[] = substr($data,$start,$end-$start+strlen($end_word));
                }
            }
        }
        return $a_result;
    }
    error_reporting(E_ALL);
    ini_set('display_errors', TRUE);
    
    // modify the paths
    file_put_contents('C:\Users\Mike\Desktop\txtfile.txt', pdf2txt('C:\Users\Mike\Desktop\Athy Database\Athy Register.pdf'));
    ?>
    

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.