zintani Posted October 6, 2011 Share Posted October 6, 2011 Hello, I would like to know how to calculate identical words in the same document without specifying it, just to do it automatically for all words found in that document.
<?php
/**
 * Information Retrieval
 *
 * Class used to explore information retrieval theory and concepts.
 */

// Indexes into a single inverted-index entry: array(document number, term position).
define("DOC_ID", 0);
define("TERM_POSITION", 1);

class IR {

    /** Number of documents passed to the most recent create_index() call. */
    public $num_docs = 0;

    /**
     * Inverted index: lowercased term => list of array(DOC_ID, TERM_POSITION)
     * entries, one per occurrence of the term.
     */
    public $corpus_terms = array();

    /**
     * Show Documents
     *
     * Helper function that shows the contents of your corpus documents.
     * Echoes the document count first, then one <p> block per document.
     *
     * @param array $D document corpus as array of strings
     */
    function show_docs($D) {
        $ndocs = count($D);
        echo $ndocs;
        for ($doc_num = 0; $doc_num < $ndocs; $doc_num++) {
            ?>
            <p>
            Document #<?php echo ($doc_num+1); ?>:<br />
            <?php echo $D[$doc_num]; ?>
            </p>
            <?php
        }
    }

    /**
     * Create Index
     *
     * Creates an inverted index from the supplied corpus documents.
     * Inverted index stored in corpus_terms array. Entries are appended,
     * so an existing index is extended rather than replaced.
     *
     * @param array $D document corpus as array of strings
     */
    function create_index($D) {
        $this->num_docs = count($D);
        for ($doc_num = 0; $doc_num < $this->num_docs; $doc_num++) {
            // simplified word tokenization process: split on single spaces
            $doc_terms = explode(" ", $D[$doc_num]);

            // here is where the indexing of terms to document locations happens
            $num_terms = count($doc_terms);
            for ($term_position = 0; $term_position < $num_terms; $term_position++) {
                $term = strtolower($doc_terms[$term_position]);
                $this->corpus_terms[$term][] = array($doc_num, $term_position);
            }
        }
    }

    /**
     * Show Index
     *
     * Helper function that outputs inverted index in a standard format.
     * Sorts corpus_terms by key in place, dumps it with print_r, then prints
     * one "{doc, position}" pair per occurrence of each term.
     */
    function show_index() {
        // sort by key for alphabetically ordered output
        ksort($this->corpus_terms);
        print_r ($this->corpus_terms);

        // output a representation of the inverted index
        foreach ($this->corpus_terms AS $term => $doc_locations) {
            echo "<b>$term:</b> ";
            foreach ($doc_locations AS $doc_location) {
                echo "{".$doc_location[DOC_ID].", ".$doc_location[TERM_POSITION]."} ";
            }
            echo "<br />";
        }
    }

    /**
     * Term Frequency
     *
     * @param string   $term    term to look up (matched case-insensitively)
     * @param int|null $doc_num optional document number; when given, only
     *                          occurrences inside that document are counted
     * @return int frequency of term in the corpus (or in one document);
     *             0 when the term is not in the index
     */
    function tf($term, $doc_num = null) {
        $term = strtolower($term);

        // Unknown terms have no index entry; report zero instead of
        // calling count() on a missing key.
        if (!isset($this->corpus_terms[$term])) {
            return 0;
        }

        if ($doc_num === null) {
            return count($this->corpus_terms[$term]);
        }

        $wanted = (int) $doc_num;
        $frequency = 0;
        foreach ($this->corpus_terms[$term] as $doc_location) {
            if ($doc_location[DOC_ID] === $wanted) {
                $frequency++;
            }
        }
        return $frequency;
    }

    /**
     * Term Counts
     *
     * Counts every term that occurs in one document, so callers do not have
     * to name each term themselves.
     *
     * @param int $doc_num document number as used by create_index()
     * @return array term => number of occurrences in that document, in the
     *               current key order of corpus_terms
     */
    function term_counts($doc_num) {
        $wanted = (int) $doc_num;
        $counts = array();
        foreach ($this->corpus_terms as $term => $doc_locations) {
            foreach ($doc_locations as $doc_location) {
                if ($doc_location[DOC_ID] === $wanted) {
                    $counts[$term] = isset($counts[$term]) ? $counts[$term] + 1 : 1;
                }
            }
        }
        return $counts;
    }

    /**
     * Number Documents With
     *
     * @param string $term term to look up (matched case-insensitively)
     * @return int number of distinct documents containing the term;
     *             0 when the term is not in the index
     */
    function ndw($term) {
        $term = strtolower($term);
        if (!isset($this->corpus_terms[$term])) {
            return 0;
        }

        // Key by document number so repeated occurrences within the same
        // document collapse into a single entry.
        $docs_with_term = array();
        foreach ($this->corpus_terms[$term] as $doc_location) {
            $docs_with_term[$doc_location[DOC_ID]] = true;
        }
        return count($docs_with_term);
    }

    /**
     * Inverse Document Frequency
     *
     * @param string $term term to look up (matched case-insensitively)
     * @return float log(num_docs / ndw(term)); 0.0 when no document contains
     *               the term, to avoid dividing by zero
     */
    function idf($term) {
        $docs_with_term = $this->ndw($term);
        if ($docs_with_term === 0) {
            return 0.0;
        }
        return log(($this->num_docs) / $docs_with_term);
    }

}
?>
This is the second code
<?php
include "php.php";

// Sample corpus: three short documents.
$D = array();
$D[0] = "Shipment of gold delivered in a fire delivery";
$D[1] = "Delivery of silver arrived in a silver truck of silver silver";
$D[2] = "Shipment of gold arrived in a silver truck";

$ir = new IR();

echo "<p><b>Corpus:</b></p>";
$ir->show_docs($D);

$ir->create_index($D);

echo "<p><b>Inverted Index:</b></p>";
$ir->show_index();

// Report the statistics for a single, hard-coded term.
$term = "silver";
$tf  = $ir->tf($term);
$ndw = $ir->ndw($term);
$idf = $ir->idf($term);

echo "<p>";
echo "Term Frequency of '$term' is $tf<br />";
echo "Number Of Documents with $term is $ndw<br />";
echo "Inverse Document Frequency of $term is $idf";
echo "</p>";
?>
Instead of typing this Quote $term = "silver"; how can I do it automatically for all terms appear. Furthermore, how can I do it not for all document but one by one. 
The output should be: Words of document D[0] are //$D[0] = "Shipment of gold delivered in a fire delivery by a delivery man"; shipment = 1 // written one time. of = 1 gold = 1 delivered = 1 in = 1 a = 2 fire = 1 delivery = 2 by = 1 and so on for the other documents. Link to comment https://forums.phpfreaks.com/topic/248545-tf-idf-code-help/ Share on other sites More sharing options...
Recommended Posts
Archived
This topic is now archived and is closed to further replies.