michaelhurleycaheragh Posted February 3, 2011 Share Posted February 3, 2011 Hi guys, I'm trying to parse an HTML table from an existing website into my own. However, I've run into a few problems. Does anyone know how to parse HTML tables? I'm using the PHP DOM Parser, but at the moment I am only able to return all the data on the website rather than the specific table. Thanks for any help! Quote Link to comment https://forums.phpfreaks.com/topic/226599-php-parsing-html-table/ Share on other sites More sharing options...
Maq Posted February 3, 2011 Share Posted February 3, 2011 Can we see your current code? Quote Link to comment https://forums.phpfreaks.com/topic/226599-php-parsing-html-table/#findComment-1169545 Share on other sites More sharing options...
michaelhurleycaheragh Posted February 13, 2011 Author Share Posted February 13, 2011 Heres my code:
<?php
/*******************************************************************************
Version: 1.11 ($Rev: 175 $)
Website: http://sourceforge.net/projects/simplehtmldom/
Author: S.C. Chen <me578022@gmail.com>
Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
Contributions by:
    Yousuke Kumakura (Attribute filters)
    Vadim Voituk (Negative indexes supports of "find" method)
    Antcs (Constructor with automatically load contents either text or file/url)
Licensed under The MIT License
Redistributions of files must retain the above copyright notice.
*******************************************************************************/

// node types
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT',    3);
define('HDOM_TYPE_ENDTAG',  4);
define('HDOM_TYPE_ROOT',    5);
define('HDOM_TYPE_UNKNOWN', 6);
// attribute quoting styles
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO',     3);
// keys of the per-node bookkeeping array simple_html_dom_node::$_
define('HDOM_INFO_BEGIN',   0);
define('HDOM_INFO_END',     1);
define('HDOM_INFO_QUOTE',   2);
define('HDOM_INFO_SPACE',   3);
define('HDOM_INFO_TEXT',    4);
define('HDOM_INFO_INNER',   5);
define('HDOM_INFO_OUTER',   6);
define('HDOM_INFO_ENDSPACE',7);

// helper functions
// -----------------------------------------------------------------------------

/**
 * Get an html dom from a file or URL.
 * All arguments are forwarded unchanged to file_get_contents().
 *
 * @return simple_html_dom
 */
function file_get_html() {
    $dom = new simple_html_dom;
    $args = func_get_args();
    $dom->load(call_user_func_array('file_get_contents', $args), true);
    return $dom;
}

/**
 * Get an html dom from a string.
 *
 * @param string $str       markup to parse
 * @param bool   $lowercase lowercase tag and attribute names while parsing
 * @return simple_html_dom
 */
function str_get_html($str, $lowercase=true) {
    $dom = new simple_html_dom;
    $dom->load($str, $lowercase);
    return $dom;
}

/**
 * Echo an indented dump of a node and all of its descendants.
 *
 * @param simple_html_dom_node $node      node to start from
 * @param bool                 $show_attr also print each attribute as [name]=>"value"
 * @param int                  $deep      current depth, used for the indentation
 */
function dump_html_tree($node, $show_attr=true, $deep=0) {
    $lead = str_repeat('    ', $deep);
    echo $lead.$node->tag;
    if ($show_attr && count($node->attr)>0) {
        echo '(';
        foreach($node->attr as $k=>$v)
            echo "[$k]=>\"".$node->$k.'", ';
        echo ')';
    }
    echo "\n";

    foreach($node->nodes as $c)
        dump_html_tree($c, $show_attr, $deep+1);
}

/**
 * Get a dom from a file (deprecated, same behaviour as file_get_html()).
 *
 * @return simple_html_dom
 */
function file_get_dom() {
    $dom = new simple_html_dom;
    $args = func_get_args();
    $dom->load(call_user_func_array('file_get_contents', $args), true);
    return $dom;
}

/**
 * Get a dom from a string (deprecated, same behaviour as str_get_html()).
 *
 * @param string $str
 * @param bool   $lowercase
 * @return simple_html_dom
 */
function str_get_dom($str, $lowercase=true) {
    $dom = new simple_html_dom;
    $dom->load($str, $lowercase);
    return $dom;
}

// simple html dom node
// -----------------------------------------------------------------------------

/**
 * One node of the parsed tree (element, text, comment, unknown or root).
 *
 * Every node registers itself in the owning parser's flat $nodes list; the
 * HDOM_INFO_BEGIN / HDOM_INFO_END entries of $_ are indexes into that list
 * and delimit the range holding the node's descendants.
 */
class simple_html_dom_node {
    public $nodetype = HDOM_TYPE_TEXT;
    public $tag = 'text';
    public $attr = array();     // attribute name => value (true for value-less attributes)
    public $children = array(); // nodes linked as children (see simple_html_dom::link_nodes)
    public $nodes = array();    // every directly linked node, text included
    public $parent = null;
    public $_ = array();        // bookkeeping, indexed by the HDOM_INFO_* constants
    private $dom = null;        // owning simple_html_dom instance

    /**
     * @param simple_html_dom $dom owning parser; the node appends itself to $dom->nodes
     */
    function __construct($dom) {
        $this->dom = $dom;
        $dom->nodes[] = $this;
    }

    function __destruct() {
        $this->clear();
    }

    function __toString() {
        return $this->outertext();
    }

    // clean up memory due to php5 circular references memory leak...
    function clear() {
        $this->dom = null;
        $this->nodes = null;
        $this->parent = null;
        $this->children = null;
    }

    // dump node's tree
    function dump($show_attr=true) {
        dump_html_tree($this, $show_attr);
    }

    // returns the parent of node
    function parent() {
        return $this->parent;
    }

    /**
     * Returns the children of the node.
     *
     * @param int $idx -1 for the whole list, otherwise the index of a single child
     * @return array|simple_html_dom_node|null null when $idx does not exist
     */
    function children($idx=-1) {
        if ($idx===-1) return $this->children;
        if (isset($this->children[$idx])) return $this->children[$idx];
        return null;
    }

    // returns the first child of node, or null when there is none
    function first_child() {
        if (count($this->children)>0) return $this->children[0];
        return null;
    }

    // returns the last child of node, or null when there is none
    function last_child() {
        if (($count=count($this->children))>0) return $this->children[$count-1];
        return null;
    }

    // returns the next sibling of node, or null when there is none
    function next_sibling() {
        if ($this->parent===null) return null;
        $idx = 0;
        $count = count($this->parent->children);
        // locate this node among the parent's children
        while ($idx<$count && $this!==$this->parent->children[$idx])
            ++$idx;
        if (++$idx>=$count) return null;
        return $this->parent->children[$idx];
    }

    // returns the previous sibling of node, or null when there is none
    function prev_sibling() {
        if ($this->parent===null) return null;
        $idx = 0;
        $count = count($this->parent->children);
        // locate this node among the parent's children
        while ($idx<$count && $this!==$this->parent->children[$idx])
            ++$idx;
        if (--$idx<0) return null;
        return $this->parent->children[$idx];
    }

    /**
     * Get dom node's inner html.
     * An assigned innertext wins, then the raw text of text-like nodes,
     * otherwise the outer text of all linked nodes is concatenated.
     */
    function innertext() {
        if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        $ret = '';
        foreach($this->nodes as $n)
            $ret .= $n->outertext();
        return $ret;
    }

    /**
     * Get dom node's outer text (with tag).
     * Invokes the parser's callback (if one is set) with this node first.
     */
    function outertext() {
        if ($this->tag==='root') return $this->innertext();

        // trigger callback
        if ($this->dom->callback!==null)
            call_user_func_array($this->dom->callback, array($this));

        if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        // render begin tag
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();

        // render inner text
        if (isset($this->_[HDOM_INFO_INNER]))
            $ret .= $this->_[HDOM_INFO_INNER];
        else {
            foreach($this->nodes as $n)
                $ret .= $n->outertext();
        }

        // render end tag (an END of 0 marks a node without its own closing tag)
        if(isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
            $ret .= '</'.$this->tag.'>';
        return $ret;
    }

    /**
     * Get dom node's plain text.
     * Comments, unknown nodes, <script> and <style> contribute nothing.
     */
    function text() {
        if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
        switch ($this->nodetype) {
            case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
            case HDOM_TYPE_COMMENT: return '';
            case HDOM_TYPE_UNKNOWN: return '';
        }
        if (strcasecmp($this->tag, 'script')===0) return '';
        if (strcasecmp($this->tag, 'style')===0) return '';

        $ret = '';
        foreach($this->nodes as $n)
            $ret .= $n->text();
        return $ret;
    }

    // inner text with the CDATA wrapper markers removed
    function xmltext() {
        $ret = $this->innertext();
        $ret = str_ireplace('<![CDATA[', '', $ret);
        $ret = str_replace(']]>', '', $ret);
        return $ret;
    }

    /**
     * Build node's text with tag: the opening tag with its attributes,
     * re-using the spacing and quoting recorded while parsing.
     */
    function makeup() {
        // text, comment, unknown
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        $ret = '<'.$this->tag;
        $i = -1;

        foreach($this->attr as $key=>$val) {
            ++$i;

            // skip removed attribute
            if ($val===null || $val===false)
                continue;

            $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
            //no value attr: nowrap, checked selected...
            if ($val===true)
                $ret .= $key;
            else {
                switch($this->_[HDOM_INFO_QUOTE][$i]) {
                    case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                    case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                    default: $quote = '';
                }
                $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
            }
        }
        $ret = $this->dom->restore_noise($ret);
        return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
    }

    /**
     * Find elements by css selector.
     *
     * @param string   $selector one or more comma separated selectors
     * @param int|null $idx      null returns every match as an array; an integer
     *                           returns that single match (negative counts from the end)
     * @return array|simple_html_dom_node|null null when $idx has no match
     */
    function find($selector, $idx=null) {
        $selectors = $this->parse_selector($selector);
        if (($count=count($selectors))===0) return array();
        $found_keys = array();

        // find each selector
        for ($c=0; $c<$count; ++$c) {
            // fixed: inspect the selector group being processed, not always the first one
            if (($level=count($selectors[$c]))===0) return array();
            if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

            $head = array($this->_[HDOM_INFO_BEGIN]=>1);

            // handle descendant selectors, no recursive!
            for ($l=0; $l<$level; ++$l) {
                $ret = array();
                foreach($head as $k=>$v) {
                    $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                    $n->seek($selectors[$c][$l], $ret);
                }
                $head = $ret;
            }

            foreach($head as $k=>$v) {
                if (!isset($found_keys[$k]))
                    $found_keys[$k] = 1;
            }
        }

        // sort keys
        ksort($found_keys);

        $found = array();
        foreach($found_keys as $k=>$v)
            $found[] = $this->dom->nodes[$k];

        // return nth-element or array
        if (is_null($idx)) return $found;
        else if ($idx<0) $idx = count($found) + $idx;
        return (isset($found[$idx])) ? $found[$idx] : null;
    }

    /**
     * Seek for given conditions below this node.
     *
     * @param array $selector array($tag, $key, $val, $exp, $no_key) as built by parse_selector()
     * @param array $ret      receives the index of every matching node as a key
     */
    protected function seek($selector, &$ret) {
        list($tag, $key, $val, $exp, $no_key) = $selector;

        // xpath index
        if ($tag && $key && is_numeric($key)) {
            $count = 0;
            foreach ($this->children as $c) {
                if ($tag==='*' || $tag===$c->tag) {
                    if (++$count==$key) {
                        $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                        return;
                    }
                }
            }
            return;
        }

        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
        if ($end==0) {
            // no end recorded for this node: borrow it from the nearest ancestor that has one
            $parent = $this->parent;
            // fixed: test for null before looking into the ancestor
            while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
                $end -= 1;
                $parent = $parent->parent;
            }
            if ($parent!==null)
                $end += $parent->_[HDOM_INFO_END];
        }

        for($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
            $node = $this->dom->nodes[$i];
            $pass = true;

            if ($tag==='*' && !$key) {
                if (in_array($node, $this->children, true))
                    $ret[$i] = 1;
                continue;
            }

            // compare tag
            if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
            // compare key
            if ($pass && $key) {
                if ($no_key) {
                    if (isset($node->attr[$key])) $pass=false;
                }
                else if (!isset($node->attr[$key])) $pass=false;
            }
            // compare value
            if ($pass && $key && $val && $val!=='*') {
                $check = $this->match($exp, $val, $node->attr[$key]);
                // handle multiple class
                if (!$check && strcasecmp($key, 'class')===0) {
                    foreach(explode(' ',$node->attr[$key]) as $k) {
                        $check = $this->match($exp, $val, $k);
                        if ($check) break;
                    }
                }
                if (!$check) $pass = false;
            }
            if ($pass) $ret[$i] = 1;
            unset($node);
        }
    }

    /**
     * Compare an attribute value against a selector value.
     *
     * @param string $exp     one of '=', '!=', '^=', '$=', '*='
     * @param string $pattern value taken from the selector
     * @param string $value   attribute value of the node
     * @return bool|int false for an unknown operator
     */
    protected function match($exp, $pattern, $value) {
        switch ($exp) {
            case '=':
                return ($value===$pattern);
            case '!=':
                return ($value!==$pattern);
            case '^=':
                return preg_match("/^".preg_quote($pattern,'/')."/", $value);
            case '$=':
                return preg_match("/".preg_quote($pattern,'/')."$/", $value);
            case '*=':
                // a pattern starting with '/' is used as a complete regular expression
                if ($pattern[0]=='/')
                    return preg_match($pattern, $value);
                return preg_match("/".$pattern."/i", $value);
        }
        return false;
    }

    /**
     * Split a selector string into groups of array($tag, $key, $val, $exp, $no_key).
     *
     * @param string $selector_string
     * @return array one entry per comma separated selector, each a list of
     *               descendant steps
     */
    protected function parse_selector($selector_string) {
        // pattern of CSS selectors, modified from mootools
        // fixed: restored the "(?:(" group that was lost from the pasted pattern, and moved
        // the hyphens to the end of the character classes ("\w-:" is rejected as an invalid
        // range by PCRE2, i.e. PHP >= 7.3)
        $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
        preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
        $selectors = array();
        $result = array();

        foreach ($matches as $m) {
            $m[0] = trim($m[0]);
            if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
            // for browser generated xpath
            if ($m[1]==='tbody') continue;

            list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
            if(!empty($m[2])) {$key='id'; $val=$m[2];}
            if(!empty($m[3])) {$key='class'; $val=$m[3];}
            if(!empty($m[4])) {$key=$m[4];}
            if(!empty($m[5])) {$exp=$m[5];}
            if(!empty($m[6])) {$val=$m[6];}

            // convert to lowercase (a missing key stays null instead of being passed to strtolower)
            if ($this->dom->lowercase) {
                $tag = strtolower($tag);
                if ($key!==null) $key = strtolower($key);
            }
            //elements that do NOT have the specified attribute
            if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

            $result[] = array($tag, $key, $val, $exp, $no_key);
            // a comma closes the current selector group
            if (trim($m[7])===',') {
                $selectors[] = $result;
                $result = array();
            }
        }
        if (count($result)>0)
            $selectors[] = $result;
        return $selectors;
    }

    /**
     * Magic read access: attributes first, then the virtual properties
     * outertext / innertext / plaintext / xmltext. Any other name yields
     * whether an attribute with that name exists.
     */
    function __get($name) {
        if (isset($this->attr[$name])) return $this->attr[$name];
        switch($name) {
            case 'outertext': return $this->outertext();
            case 'innertext': return $this->innertext();
            case 'plaintext': return $this->text();
            case 'xmltext': return $this->xmltext();
            default: return array_key_exists($name, $this->attr);
        }
    }

    /**
     * Magic write access: overrides outertext / innertext, any other name
     * sets an attribute (new attributes get default spacing and double quotes).
     */
    function __set($name, $value) {
        switch($name) {
            case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
            case 'innertext':
                if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
                return $this->_[HDOM_INFO_INNER] = $value;
        }
        if (!isset($this->attr[$name])) {
            $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
            $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
        }
        $this->attr[$name] = $value;
    }

    function __isset($name) {
        switch($name) {
            case 'outertext': return true;
            case 'innertext': return true;
            case 'plaintext': return true;
        }
        //no value attr: nowrap, checked selected...
        return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
    }

    function __unset($name) {
        if (isset($this->attr[$name]))
            unset($this->attr[$name]);
    }

    // camel naming conventions
    function getAllAttributes() {return $this->attr;}
    function getAttribute($name) {return $this->__get($name);}
    function setAttribute($name, $value) {$this->__set($name, $value);}
    function hasAttribute($name) {return $this->__isset($name);}
    function removeAttribute($name) {$this->__set($name, null);}
    function getElementById($id) {return $this->find("#$id", 0);}
    function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
    function getElementByTagName($name) {return $this->find($name, 0);}
    function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
    function parentNode() {return $this->parent();}
    function childNodes($idx=-1) {return $this->children($idx);}
    function firstChild() {return $this->first_child();}
    function lastChild() {return $this->last_child();}
    function nextSibling() {return $this->next_sibling();}
    function previousSibling() {return $this->prev_sibling();}
}

// simple html dom parser
// -----------------------------------------------------------------------------

/**
 * The parser: turns a markup string into a tree of simple_html_dom_node
 * objects hanging off $root, and keeps every created node in the flat
 * $nodes list.
 */
class simple_html_dom {
    public $root = null;
    public $nodes = array();
    public $callback = null;
    public $lowercase = false;
    protected $pos;     // current offset into $doc
    protected $doc;     // markup being parsed (with noise replaced by placeholders)
    protected $char;    // character at $pos, null once the end is reached
    protected $size;    // length of $doc
    protected $cursor;  // running node counter
    protected $parent;  // node new nodes are currently linked to
    protected $noise = array(); // placeholder => removed fragment
    protected $token_blank = " \t\r\n";
    protected $token_equal = ' =/>';
    protected $token_slash = " />\r\n\t";
    protected $token_attr = ' >';
    // use isset instead of in_array, performance boost about 30%...
    protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
    protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
    protected $optional_closing_tags = array(
        'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
        'th'=>array('th'=>1),
        'td'=>array('td'=>1),
        'li'=>array('li'=>1),
        'dt'=>array('dt'=>1, 'dd'=>1),
        'dd'=>array('dd'=>1, 'dt'=>1),
        'dl'=>array('dd'=>1, 'dt'=>1),
        'p'=>array('p'=>1),
        'nobr'=>array('nobr'=>1),
    );

    /**
     * @param string|null $str markup, or a file path / http(s) URL to load it from
     */
    function __construct($str=null) {
        if ($str) {
            // generalized: https URLs are fetched like http ones instead of being parsed as markup
            if (preg_match("/^https?:\/\//i",$str) || is_file($str))
                $this->load_file($str);
            else
                $this->load($str);
        }
    }

    function __destruct() {
        $this->clear();
    }

    /**
     * Load html from string.
     * Comments, CDATA, <style>, <script>, <code>, server side and smarty
     * blocks are swapped for placeholders before the tag parser runs.
     *
     * @param string $str
     * @param bool   $lowercase lowercase tag and attribute names
     */
    function load($str, $lowercase=true) {
        // prepare
        $this->prepare($str, $lowercase);
        // strip out comments
        $this->remove_noise("'<!--(.*?)-->'is");
        // strip out cdata
        $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);
        // strip out <style> tags
        $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
        $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
        // strip out <script> tags
        $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
        $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
        // strip out preformatted tags
        $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
        // strip out server side scripts
        $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
        // strip smarty scripts
        $this->remove_noise("'(\{\w)(.*?)(\})'s", true);

        // parsing
        while ($this->parse());
        // end
        $this->root->_[HDOM_INFO_END] = $this->cursor;
    }

    // load html from file; all arguments are forwarded to file_get_contents()
    function load_file() {
        $args = func_get_args();
        $this->load(call_user_func_array('file_get_contents', $args), true);
    }

    // set callback function (called with each node from simple_html_dom_node::outertext())
    function set_callback($function_name) {
        $this->callback = $function_name;
    }

    // remove callback function
    function remove_callback() {
        $this->callback = null;
    }

    /**
     * Save dom as string.
     *
     * @param string $filepath when not empty the markup is also written to this file
     * @return string
     */
    function save($filepath='') {
        $ret = $this->root->innertext();
        if ($filepath!=='') file_put_contents($filepath, $ret);
        return $ret;
    }

    /**
     * Find dom node by css selector, starting at the root.
     *
     * @param string   $selector
     * @param int|null $idx null for all matches as an array, otherwise a single match
     */
    function find($selector, $idx=null) {
        return $this->root->find($selector, $idx);
    }

    // clean up memory due to php5 circular references memory leak...
    function clear() {
        foreach($this->nodes as $n) {$n->clear(); $n = null;}
        if (isset($this->parent)) {$this->parent->clear(); unset($this->parent);}
        if (isset($this->root)) {$this->root->clear(); unset($this->root);}
        unset($this->doc);
        unset($this->noise);
    }

    function dump($show_attr=true) {
        $this->root->dump($show_attr);
    }

    // prepare HTML data and init everything
    protected function prepare($str, $lowercase=true) {
        $this->clear();
        $this->doc = $str;
        $this->pos = 0;
        $this->cursor = 1;
        $this->noise = array();
        $this->nodes = array();
        $this->lowercase = $lowercase;
        $this->root = new simple_html_dom_node($this);
        $this->root->tag = 'root';
        $this->root->_[HDOM_INFO_BEGIN] = -1;
        $this->root->nodetype = HDOM_TYPE_ROOT;
        $this->parent = $this->root;
        // set the length of content
        $this->size = strlen($str);
        if ($this->size>0) $this->char = $this->doc[0];
    }

    /**
     * Parse html content: consume either the text up to the next '<' or one tag.
     *
     * @return bool false once the input is exhausted
     */
    protected function parse() {
        if (($s = $this->copy_until_char('<'))==='')
            return $this->read_tag();

        // text
        $node = new simple_html_dom_node($this);
        ++$this->cursor;
        $node->_[HDOM_INFO_TEXT] = $s;
        $this->link_nodes($node, false);
        return true;
    }

    /**
     * Read tag info: handles end tags, doctype/comment-like tags, malformed
     * tags (kept as text) and begin tags with their attributes.
     *
     * @return bool false when the current character is not '<' (end of input)
     */
    protected function read_tag() {
        if ($this->char!=='<') {
            $this->root->_[HDOM_INFO_END] = $this->cursor;
            return false;
        }
        $begin_tag_pos = $this->pos;
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

        // end tag
        if ($this->char==='/') {
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            // fixed: was $this->token_blank_t, a property that does not exist
            $this->skip($this->token_blank);
            $tag = $this->copy_until_char('>');

            // skip attributes in end tag
            if (($pos = strpos($tag, ' '))!==false)
                $tag = substr($tag, 0, $pos);

            $parent_lower = strtolower($this->parent->tag);
            $tag_lower = strtolower($tag);

            if ($parent_lower!==$tag_lower) {
                if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                    $this->parent->_[HDOM_INFO_END] = 0;
                    $org_parent = $this->parent;

                    // climb until an ancestor with the closing tag's name is found
                    while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                        $this->parent = $this->parent->parent;

                    if (strtolower($this->parent->tag)!==$tag_lower) {
                        $this->parent = $org_parent; // restore original parent
                        if ($this->parent->parent) $this->parent = $this->parent->parent;
                        $this->parent->_[HDOM_INFO_END] = $this->cursor;
                        return $this->as_text_node($tag);
                    }
                }
                else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                    $this->parent->_[HDOM_INFO_END] = 0;
                    $org_parent = $this->parent;

                    while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                        $this->parent = $this->parent->parent;

                    if (strtolower($this->parent->tag)!==$tag_lower) {
                        $this->parent = $org_parent; // restore original parent
                        $this->parent->_[HDOM_INFO_END] = $this->cursor;
                        return $this->as_text_node($tag);
                    }
                }
                else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) {
                    $this->parent->_[HDOM_INFO_END] = 0;
                    $this->parent = $this->parent->parent;
                }
                else
                    return $this->as_text_node($tag);
            }

            $this->parent->_[HDOM_INFO_END] = $this->cursor;
            if ($this->parent->parent) $this->parent = $this->parent->parent;

            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            return true;
        }

        $node = new simple_html_dom_node($this);
        $node->_[HDOM_INFO_BEGIN] = $this->cursor;
        ++$this->cursor;
        $tag = $this->copy_until($this->token_slash);

        // doctype, cdata & comments...
        if (isset($tag[0]) && $tag[0]==='!') {
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

            if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
                $node->nodetype = HDOM_TYPE_COMMENT;
                $node->tag = 'comment';
            } else {
                $node->nodetype = HDOM_TYPE_UNKNOWN;
                $node->tag = 'unknown';
            }

            if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
            $this->link_nodes($node, true);
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            return true;
        }

        // text
        // (was "$pos=strpos(...)!==false": the comparison bound first, leaving a dead boolean in $pos)
        if (strpos($tag, '<')!==false) {
            $tag = '<' . substr($tag, 0, -1);
            $node->_[HDOM_INFO_TEXT] = $tag;
            $this->link_nodes($node, false);
            $this->char = $this->doc[--$this->pos]; // prev
            return true;
        }

        // fixed: "[\w-:]" is rejected as an invalid range by PCRE2 (PHP >= 7.3)
        if (!preg_match("/^[\w:-]+$/", $tag)) {
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
            if ($this->char==='<') {
                $this->link_nodes($node, false);
                return true;
            }

            if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
            $this->link_nodes($node, false);
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            return true;
        }

        // begin tag
        $node->nodetype = HDOM_TYPE_ELEMENT;
        $tag_lower = strtolower($tag);
        $node->tag = ($this->lowercase) ? $tag_lower : $tag;

        // handle optional closing tags
        if (isset($this->optional_closing_tags[$tag_lower]) ) {
            while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            $node->parent = $this->parent;
        }

        $guard = 0; // prevent infinity loop
        $space = array($this->copy_skip($this->token_blank), '', '');

        // attributes
        do {
            if ($this->char!==null && $space[0]==='') break;
            $name = $this->copy_until($this->token_equal);
            if($guard===$this->pos) {
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                continue;
            }
            $guard = $this->pos;

            // handle endless '<'
            if($this->pos>=$this->size-1 && $this->char!=='>') {
                $node->nodetype = HDOM_TYPE_TEXT;
                $node->_[HDOM_INFO_END] = 0;
                $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
                $node->tag = 'text';
                $this->link_nodes($node, false);
                return true;
            }

            // handle mismatch '<'
            if($this->doc[$this->pos-1]=='<') {
                $node->nodetype = HDOM_TYPE_TEXT;
                $node->tag = 'text';
                $node->attr = array();
                $node->_[HDOM_INFO_END] = 0;
                $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
                $this->pos -= 2;
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                $this->link_nodes($node, false);
                return true;
            }

            if ($name!=='/' && $name!=='') {
                $space[1] = $this->copy_skip($this->token_blank);
                $name = $this->restore_noise($name);
                if ($this->lowercase) $name = strtolower($name);
                if ($this->char==='=') {
                    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                    $this->parse_attr($node, $name, $space);
                }
                else {
                    //no value attr: nowrap, checked selected...
                    $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                    $node->attr[$name] = true;
                    if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
                }
                $node->_[HDOM_INFO_SPACE][] = $space;
                $space = array($this->copy_skip($this->token_blank), '', '');
            }
            else
                break;
        } while($this->char!=='>' && $this->char!=='/');

        $this->link_nodes($node, true);
        $node->_[HDOM_INFO_ENDSPACE] = $space[0];

        // check self closing
        if ($this->copy_until_char_escape('>')==='/') {
            $node->_[HDOM_INFO_ENDSPACE] .= '/';
            $node->_[HDOM_INFO_END] = 0;
        }
        else {
            // reset parent
            if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
        }
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    /**
     * Parse attributes: read one attribute value (double quoted, single
     * quoted or bare) and record it together with its quoting style.
     *
     * @param simple_html_dom_node $node
     * @param string               $name  attribute name
     * @param array                $space spacing triple; index 2 receives the blanks after '='
     */
    protected function parse_attr($node, $name, &$space) {
        $space[2] = $this->copy_skip($this->token_blank);
        switch($this->char) {
            case '"':
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('"'));
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                break;
            case '\'':
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('\''));
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                break;
            default:
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
        }
    }

    /**
     * Link node's parent: attach the node to the current parent.
     *
     * @param simple_html_dom_node $node
     * @param bool                 $is_child also list the node among the parent's children
     */
    protected function link_nodes(&$node, $is_child) {
        $node->parent = $this->parent;
        $this->parent->nodes[] = $node;
        if ($is_child)
            $this->parent->children[] = $node;
    }

    // as a text node: keep an end tag that could not be matched as literal text
    protected function as_text_node($tag) {
        $node = new simple_html_dom_node($this);
        ++$this->cursor;
        $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
        $this->link_nodes($node, false);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // advance past any run of the given characters
    protected function skip($chars) {
        $this->pos += strspn($this->doc, $chars, $this->pos);
        $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    }

    // advance past any run of the given characters and return that run
    protected function copy_skip($chars) {
        $pos = $this->pos;
        $len = strspn($this->doc, $chars, $pos);
        $this->pos += $len;
        $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        if ($len===0) return '';
        return substr($this->doc, $pos, $len);
    }

    // return the text up to (not including) the first of the given characters and advance there
    protected function copy_until($chars) {
        $pos = $this->pos;
        $len = strcspn($this->doc, $chars, $pos);
        $this->pos += $len;
        $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return substr($this->doc, $pos, $len);
    }

    // return the text up to (not including) $char; the rest of the input when $char does not occur
    protected function copy_until_char($char) {
        if ($this->char===null) return '';

        if (($pos = strpos($this->doc, $char, $this->pos))===false) {
            $ret = substr($this->doc, $this->pos, $this->size-$this->pos);
            $this->char = null;
            $this->pos = $this->size;
            return $ret;
        }

        if ($pos===$this->pos) return '';
        $pos_old = $this->pos;
        $this->char = $this->doc[$pos];
        $this->pos = $pos;
        return substr($this->doc, $pos_old, $pos-$pos_old);
    }

    // like copy_until_char(), but an occurrence preceded by a backslash does not end the copy
    protected function copy_until_char_escape($char) {
        if ($this->char===null) return '';

        $start = $this->pos;
        while(1) {
            if (($pos = strpos($this->doc, $char, $start))===false) {
                $ret = substr($this->doc, $this->pos, $this->size-$this->pos);
                $this->char = null;
                $this->pos = $this->size;
                return $ret;
            }

            if ($pos===$this->pos) return '';

            if ($this->doc[$pos-1]==='\\') {
                $start = $pos+1;
                continue;
            }

            $pos_old = $this->pos;
            $this->char = $this->doc[$pos];
            $this->pos = $pos;
            return substr($this->doc, $pos_old, $pos-$pos_old);
        }
    }

    /**
     * Remove noise from html content: every match of $pattern is replaced in
     * the document by a fixed-width placeholder and remembered in $noise.
     *
     * @param string $pattern    regular expression
     * @param bool   $remove_tag true replaces the whole match, false only the first group
     */
    protected function remove_noise($pattern, $remove_tag=false) {
        $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);

        // walk backwards so the offsets of earlier matches stay valid
        for ($i=$count-1; $i>-1; --$i) {
            // fixed: five zero-padded digits instead of three, so the fixed-width key
            // no longer overflows after 900 fragments
            $key = '___noise___'.sprintf('%05d', count($this->noise));
            $idx = ($remove_tag) ? 0 : 1;
            $this->noise[$key] = $matches[$i][$idx][0];
            $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
        }

        // reset the length of content
        $this->size = strlen($this->doc);
        if ($this->size>0) $this->char = $this->doc[0];
    }

    /**
     * Restore noise to html content: put the removed fragments back in place
     * of their placeholders.
     *
     * @param string $text
     * @return string
     */
    function restore_noise($text) {
        $marker = '___noise___';
        $key_len = strlen($marker) + 5; // must match the key built in remove_noise()
        $offset = 0;

        while (($pos=strpos($text, $marker, $offset))!==false) {
            $key = substr($text, $pos, $key_len);
            if (isset($this->noise[$key])) {
                $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+$key_len);
                // rescan from the same spot: a restored fragment may itself hold placeholders
                $offset = $pos;
            }
            else {
                // fixed: step over an unknown placeholder; the loop used to spin on it forever
                $offset = $pos + strlen($marker);
            }
        }
        return $text;
    }

    function __toString() {
        return $this->root->innertext();
    }

    // virtual properties outertext / innertext / plaintext of the whole document
    function __get($name) {
        switch($name) {
            case 'outertext': return $this->root->innertext();
            case 'innertext': return $this->root->innertext();
            case 'plaintext': return $this->root->text();
        }
    }

    // camel naming conventions
    function childNodes($idx=-1) {return $this->root->childNodes($idx);}
    function firstChild() {return $this->root->first_child();}
    function lastChild() {return $this->root->last_child();}
    function getElementById($id) {return $this->find("#$id", 0);}
    function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
    function getElementByTagName($name) {return $this->find($name, 0);}
    // NOTE(review): the -1 default makes find() return only the LAST match, whereas
    // getElementsById() above defaults to null (all matches) - confirm which is intended.
    function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);}
    function loadFile() {$args = func_get_args();$this->load(call_user_func_array('file_get_contents', $args), true);}
}

// The parser classes and helpers are already part of this file, so it is not
// included a second time here. Add include_once('simple_html_dom.php') back
// only if this script is moved into a file of its own.
$html = file_get_html('http://www.bordbia.ie/industryservices/cattle/pages/default.aspx');

// Echoing $html prints the whole page. To get one table, select it and ask
// find() for a single match: without an index find() returns an array, and a
// bare word such as 'CommodityListTable' is read as a tag name.
// NOTE(review): assumes the table carries id or class "CommodityListTable" -
// confirm against the page markup.
$table = $html->find('table#CommodityListTable, table.CommodityListTable', 0);
echo ($table!==null) ? $table->outertext : 'Table "CommodityListTable" not found.';
?>
Quote Link to comment https://forums.phpfreaks.com/topic/226599-php-parsing-html-table/#findComment-1173848 Share on other sites More sharing options...
michaelhurleycaheragh Posted February 14, 2011 Author Share Posted February 14, 2011 I forgot to add that the code is doing something. It's returning all the details on the specified webpage, but it's not returning just the specified table, which is what I require. Any help appreciated! Quote Link to comment https://forums.phpfreaks.com/topic/226599-php-parsing-html-table/#findComment-1173851 Share on other sites More sharing options...
kenrbnsn Posted February 14, 2011 Share Posted February 14, 2011 The next time you post code to this forum, please put the code between [code] tags. Ken Quote Link to comment https://forums.phpfreaks.com/topic/226599-php-parsing-html-table/#findComment-1173918 Share on other sites More sharing options...
Recommended Posts
Join the conversation
You can post now and register later. If you have an account, sign in now to post with your account.