Jump to content

Code not working and no error output??


mikebyrne

Recommended Posts

I run the code to take text from a pdf and place it into a text file but I get a blank screen with no errors.

 

The textfile is not created on the either??

 

Any ideas why its not working?

 

<?php
// Function    : pdf2txt()
// Arguments   : $filename - Filename of the PDF you want to extract
// Description : Reads a pdf file, extracts data streams, and manages
//               their translation to plain text - returning the plain
//               text at the end
// Authors      : Jonathan Beckett, 2005-05-02
//                            : Sven Schuberth, 2007-03-29

function pdf2txt($filename){    

    $pdftext = pdf2txt('C:\Users\Mike\Desktop\Athy Database\Athy Register.pdf');

    $data = getFileData($filename);
   
    $s=strpos($data,"%")+1;
   
    $version=substr($data,$s,strpos($data,"%",$s)-1);
    if(substr_count($version,"PDF-1.2")==0)
        return handleV3($data);
    else
        return handleV2($data);

   
}
// handles the verson 1.2
function handleV2($data){
       
    // grab objects and then grab their contents (chunks)
    $a_obj = getDataArray($data,"obj","endobj");
   
    foreach($a_obj as $obj){
       
        $a_filter = getDataArray($obj,"<<",">>");
   
        if (is_array($a_filter)){
            $j++;
            $a_chunks[$j]["filter"] = $a_filter[0];

            $a_data = getDataArray($obj,"stream\r\n","endstream");
            if (is_array($a_data)){
                $a_chunks[$j]["data"] = substr($a_data[0],
strlen("stream\r\n"),
strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream"));
            }
        }
    }

    // decode the chunks
    foreach($a_chunks as $chunk){

        // look at each chunk and decide how to decode it - by looking at the contents of the filter
        $a_filter = split("/",$chunk["filter"]);
       
        if ($chunk["data"]!=""){
            // look at the filter to find out which encoding has been used           
            if (substr($chunk["filter"],"FlateDecode")!==false){
                $data =@ gzuncompress($chunk["data"]);
                if (trim($data)!=""){
                    $result_data .= ps2txt($data);
                } else {
               
                    //$result_data .= "x";
                }
            }
        }
    }
   
    return $result_data;
}

//handles versions >1.2
function handleV3($data){
    // grab objects and then grab their contents (chunks)
    $a_obj = getDataArray($data,"obj","endobj");
    $result_data="";
    foreach($a_obj as $obj){
        //check if it a string
        if(substr_count($obj,"/GS1")>0){
            //the strings are between ( and )
            preg_match_all("|\((.*?)\)|",$obj,$field,PREG_SET_ORDER);
            if(is_array($field))
                foreach($field as $data)
                    $result_data.=$data[1];
        }
    }
    return $result_data;
    
}

function ps2txt($ps_data){
    $result = "";
    $a_data = getDataArray($ps_data,"[","]");
    if (is_array($a_data)){
        foreach ($a_data as $ps_text){
            $a_text = getDataArray($ps_text,"(",")");
            if (is_array($a_text)){
                foreach ($a_text as $text){
                    $result .= substr($text,1,strlen($text)-2);
                }
            }
        }
    } else {
        // the data may just be in raw format (outside of [] tags)
        $a_text = getDataArray($ps_data,"(",")");
        if (is_array($a_text)){
            foreach ($a_text as $text){
                $result .= substr($text,1,strlen($text)-2);
            }
        }
    }
    return $result;
}

function getFileData($filename){
    $handle = fopen($filename,"rb");
    $data = fread($handle, filesize($filename));
    fclose($handle);
    return $data;
}

function getDataArray($data,$start_word,$end_word){

    $start = 0;
    $end = 0;
    unset($a_result);
   
    while ($start!==false && $end!==false){
        $start = strpos($data,$start_word,$end);
        if ($start!==false){
            $end = strpos($data,$end_word,$start);
            if ($end!==false){
                // data is between start and end
                $a_result[] = substr($data,$start,$end-$start+strlen($end_word));
            }
        }
    }
    return $a_result;
}
error_reporting(E_ALL);
ini_set('display_errors', TRUE);

// modify the paths
file_put_contents('C:\Users\Mike\Desktop\txtfile.txt', pdf2txt('C:\Users\Mike\Desktop\Athy Database\Athy Register.pdf'));
?>

Link to comment
https://forums.phpfreaks.com/topic/165678-code-not-working-and-no-error-output/
Share on other sites

replace

 

// modify the paths
file_put_contents('C:\Users\Mike\Desktop\txtfile.txt', pdf2txt('C:\Users\Mike\Desktop\Athy Database\Athy Register.pdf'));

 

with

 

file_put_contents('txtfile.txt', pdf2txt('Athy Register.pdf'));

 

??

 

 

Yes and move the Athy Register.pdf to your directory where your .php file resides.

Im getting the following output

 

>"); if (is_array($a_filter)){ $j++; $a_chunks[$j]["filter"] = $a_filter[0]; $a_data = getDataArray($obj,"stream\r\n","endstream"); if (is_array($a_data)){ $a_chunks[$j]["data"] = substr($a_data[0], strlen("stream\r\n"), strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream")); } } } // decode the chunks foreach($a_chunks as $chunk){ // look at each chunk and decide how to decode it - by looking at the contents of the filter $a_filter = split("/",$chunk["filter"]); if ($chunk["data"]!=""){ // look at the filter to find out which encoding has been used if (substr($chunk["filter"],"FlateDecode")!==false){ $data =@ gzuncompress($chunk["data"]); if (trim($data)!=""){ $result_data .= ps2txt($data); } else { //$result_data .= "x"; } } } } return $result_data; } //handles versions >1.2 function handleV3($data){ // grab objects and then grab their contents (chunks) $a_obj = getDataArray($data,"obj","endobj"); $result_data=""; foreach($a_obj as $obj){ //check if it a string if(substr_count($obj,"/GS1")>0){ //the strings are between ( and ) preg_match_all("|\((.*?)\)|",$obj,$field,PREG_SET_ORDER); if(is_array($field)) foreach($field as $data) $result_data.=$data[1]; } } return $result_data; } function ps2txt($ps_data){ $result = ""; $a_data = getDataArray($ps_data,"[","]"); if (is_array($a_data)){ foreach ($a_data as $ps_text){ $a_text = getDataArray($ps_text,"(",")"); if (is_array($a_text)){ foreach ($a_text as $text){ $result .= substr($text,1,strlen($text)-2); } } } } else { // the data may just be in raw format (outside of [] tags) $a_text = getDataArray($ps_data,"(",")"); if (is_array($a_text)){ foreach ($a_text as $text){ $result .= substr($text,1,strlen($text)-2); } } } return $result; } function getFileData($filename){ $handle = fopen($filename,"rb"); $data = fread($handle, filesize($filename)); fclose($handle); return $data; } function getDataArray($data,$start_word,$end_word){ $start = 0; $end = 0; unset($a_result); while ($start!==false && $end!==false){ $start = strpos($data,$start_word,$end); if ($start!==false){ $end = strpos($data,$end_word,$start); if ($end!==false){ // data is between start and end $a_result[] = substr($data,$start,$end-$start+strlen($end_word)); } } } return $a_result; } error_reporting(E_ALL); ini_set('display_errors', TRUE); file_put_contents('txtfile.txt', pdf2txt('Athy Register.pdf')); ?>

I've used php for quite a whhile now but this think is driving me mental!

 

I didn't remove any php tags and  the output is in the browser

 

The code looks like this

 

<?php
// Function    : pdf2txt()
// Arguments   : $filename - Filename of the PDF you want to extract
// Description : Reads a pdf file, extracts data streams, and manages
//               their translation to plain text - returning the plain
//               text at the end
// Authors      : Jonathan Beckett, 2005-05-02
//                            : Sven Schuberth, 2007-03-29

function pdf2txt($filename){    

    $pdftext = pdf2txt('C:\Users\Mike\Desktop\Athy Database\Athy Register.pdf');

    $data = getFileData($filename);
   
    $s=strpos($data,"%")+1;
   
    $version=substr($data,$s,strpos($data,"%",$s)-1);
    if(substr_count($version,"PDF-1.2")==0)
        return handleV3($data);
    else
        return handleV2($data);

   
}
// handles the verson 1.2
function handleV2($data){
       
    // grab objects and then grab their contents (chunks)
    $a_obj = getDataArray($data,"obj","endobj");
   
    foreach($a_obj as $obj){
       
        $a_filter = getDataArray($obj,"<<",">>");
   
        if (is_array($a_filter)){
            $j++;
            $a_chunks[$j]["filter"] = $a_filter[0];

            $a_data = getDataArray($obj,"stream\r\n","endstream");
            if (is_array($a_data)){
                $a_chunks[$j]["data"] = substr($a_data[0],
strlen("stream\r\n"),
strlen($a_data[0])-strlen("stream\r\n")-strlen("endstream"));
            }
        }
    }

    // decode the chunks
    foreach($a_chunks as $chunk){

        // look at each chunk and decide how to decode it - by looking at the contents of the filter
        $a_filter = split("/",$chunk["filter"]);
       
        if ($chunk["data"]!=""){
            // look at the filter to find out which encoding has been used           
            if (substr($chunk["filter"],"FlateDecode")!==false){
                $data =@ gzuncompress($chunk["data"]);
                if (trim($data)!=""){
                    $result_data .= ps2txt($data);
                } else {
               
                    //$result_data .= "x";
                }
            }
        }
    }
   
    return $result_data;
}

//handles versions >1.2
function handleV3($data){
    // grab objects and then grab their contents (chunks)
    $a_obj = getDataArray($data,"obj","endobj");
    $result_data="";
    foreach($a_obj as $obj){
        //check if it a string
        if(substr_count($obj,"/GS1")>0){
            //the strings are between ( and )
            preg_match_all("|\((.*?)\)|",$obj,$field,PREG_SET_ORDER);
            if(is_array($field))
                foreach($field as $data)
                    $result_data.=$data[1];
        }
    }
    return $result_data;
    
}

function ps2txt($ps_data){
    $result = "";
    $a_data = getDataArray($ps_data,"[","]");
    if (is_array($a_data)){
        foreach ($a_data as $ps_text){
            $a_text = getDataArray($ps_text,"(",")");
            if (is_array($a_text)){
                foreach ($a_text as $text){
                    $result .= substr($text,1,strlen($text)-2);
                }
            }
        }
    } else {
        // the data may just be in raw format (outside of [] tags)
        $a_text = getDataArray($ps_data,"(",")");
        if (is_array($a_text)){
            foreach ($a_text as $text){
                $result .= substr($text,1,strlen($text)-2);
            }
        }
    }
    return $result;
}

function getFileData($filename){
    $handle = fopen($filename,"rb");
    $data = fread($handle, filesize($filename));
    fclose($handle);
    return $data;
}

function getDataArray($data,$start_word,$end_word){

    $start = 0;
    $end = 0;
    unset($a_result);
   
    while ($start!==false && $end!==false){
        $start = strpos($data,$start_word,$end);
        if ($start!==false){
            $end = strpos($data,$end_word,$start);
            if ($end!==false){
                // data is between start and end
                $a_result[] = substr($data,$start,$end-$start+strlen($end_word));
            }
        }
    }
    return $a_result;
}
error_reporting(E_ALL);
ini_set('display_errors', TRUE);

file_put_contents('txtfile.txt', pdf2txt('Athy Register.pdf'));
?>

 

 

Archived

This topic is now archived and is closed to further replies.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.