|
<?php defined('SYSPATH') or die('No direct script access.');
|
|
/*
|
|
* This file is part of open source system FreenetIS
|
|
* and it is released under GPLv3 licence.
|
|
*
|
|
* More info about licence can be found:
|
|
* http://www.gnu.org/licenses/gpl-3.0.html
|
|
*
|
|
* More info about project can be found:
|
|
* http://www.freenetis.org/
|
|
*
|
|
*/
|
|
|
|
/**
|
|
* Parser_Html_Table is ABSTRACT class containing methods useful
|
|
* for parsing HTML tables in generic HTML files.
|
|
*
|
|
* Motivation: we want to parse HTML tables to get interesting data from various web sites.
|
|
* The HTML code of the tables often does not conforms to XML/XHTML rules.
|
|
* It often does not conform even HTML4, e.g. - the table row is not closed by </tr>,
|
|
* table cell is not closed by </td> etc.
|
|
* Therefore, XML parsers can't be used for this.
|
|
* The Tidy extension is not available on all hostings.
|
|
* If you think about parsing a non-XHTML non-HTML4.0 table, look at this class.
|
|
* The methods have been optimized to give maximum possible performance
|
|
* and memory efficiency.
|
|
* For an example how to use this class, see the Parser_Ebanka class
|
|
*
|
|
* @author Tomas <Dulik at unart dot cz>
|
|
* @version 1.0
|
|
*/
|
|
abstract class Parser_Html_Table
|
|
{
|
|
const TIMEOUT = 3;
|
|
|
|
/**
|
|
* File descriptor
|
|
*
|
|
* @var resource
|
|
*/
|
|
protected $file;
|
|
|
|
/**
|
|
* Charset
|
|
*
|
|
* @var string
|
|
*/
|
|
protected $charset;
|
|
|
|
/**
|
|
* Buffer
|
|
*
|
|
* @var string
|
|
*/
|
|
protected $buffer;
|
|
|
|
/**
|
|
* The position of the last End Of Line in the buffer
|
|
*
|
|
* @var integer
|
|
*/
|
|
protected $eoln_pos;
|
|
|
|
/**
|
|
* Var for transfering matched items from preg_match
|
|
*
|
|
* @var array
|
|
*/
|
|
protected $matches;
|
|
|
|
/**
|
|
* Table end indicator
|
|
*
|
|
* @var bool
|
|
*/
|
|
protected $table_end = false;
|
|
|
|
/**
|
|
* Opens URL
|
|
*
|
|
* @param string $url
|
|
*/
|
|
public function open($url)
|
|
{
|
|
if ($url != "")
|
|
{
|
|
$old = ini_set('default_socket_timeout', self::TIMEOUT);
|
|
|
|
if (($this->file = fopen($url, "rb")) === false)
|
|
die("Can not open file! Check if $url exists!");
|
|
|
|
ini_set('default_socket_timeout', $old);
|
|
stream_set_timeout($this->file, self::TIMEOUT);
|
|
//stream_set_blocking($this->file, 0);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* get_line appends **AT LEAST** one line from the $file into the $buffer.
|
|
*
|
|
*
|
|
* @return boolean
|
|
* @uses buffer, eoln_pos;
|
|
* In PHP4, this is MUCH faster than using fgets because of a PHP bug.
|
|
* In PHP5, this is usualy still faster than the following version based on fgets:
|
|
*
|
|
* protected function get_line_fgets() {
|
|
* if (!feof($this->file))
|
|
* $this->buffer .= fgets($this->file);
|
|
* else return false;
|
|
* $this->eoln_pos=strlen($this->buffer);
|
|
* return true;
|
|
* }
|
|
*
|
|
* Note for HTML files with super long lines (hundreds of kbytes without single
|
|
* EOLN) the fgets would be useless - it'd take a lot of memory to read a single line!
|
|
* For such files, you should modify the code of my function this way:
|
|
* Replace
|
|
* ...eoln_pos=strripos($this->buffer,"\n"))
|
|
* by something like
|
|
* ...eoln_pos=find_row_end()
|
|
*/
|
|
public function get_line()
|
|
{
|
|
while (!feof($this->file))
|
|
{
|
|
// read 8192 bytes from file or one packet
|
|
$new_part = fread($this->file, 8192);
|
|
$this->buffer .= $new_part;
|
|
|
|
// search eoln from end: found ?
|
|
if (($this->eoln_pos = strripos($this->buffer, "\n")) !== false)
|
|
{
|
|
// eoln found! done, OK...
|
|
return true;
|
|
}
|
|
}
|
|
|
|
// EOF happened ?
|
|
if (!isset($new_part))
|
|
{
|
|
// EOF right when the function begun? Return EOF!
|
|
return false;
|
|
}
|
|
|
|
// EOF happened but no EOLN
|
|
$this->eoln_pos = strlen($this->buffer); // set eoln_pos to EOF...
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* find_tag_and_trim($tag) tries to find the tag in the $this->buffer
|
|
* and trim the beginning of the buffer till (and including) the $tag
|
|
* returns false if string not found.
|
|
* returns true if string found, and the variable $this->buffer contains
|
|
* string trimmed from the first occurence of $tag
|
|
*/
|
|
protected function find_tag_and_trim($tag)
|
|
{
|
|
$found = false;
|
|
do
|
|
{
|
|
// can you find the tag ?
|
|
if (($pos = stripos($this->buffer, $tag)) !== false)
|
|
{
|
|
$found = true;
|
|
// set the cut $pos(ition) behind $tag
|
|
$pos += strlen($tag);
|
|
// now cut away everything from the beginning till the cut position
|
|
$this->buffer = substr($this->buffer, $pos);
|
|
// and update the counters
|
|
$this->eoln_pos -= $pos;
|
|
}
|
|
// tag not found and eoln found previously?
|
|
else if ($this->eoln_pos > 0)
|
|
{
|
|
// cut away all from beginning till eoln
|
|
$this->buffer = substr($this->buffer, $this->eoln_pos);
|
|
// so we don't have to deal with these lines again
|
|
$this->eoln_pos = 0;
|
|
}
|
|
}
|
|
while (!$found && $this->get_line());
|
|
|
|
return $found;
|
|
}
|
|
|
|
/**
|
|
* The same as previous function, but for multiple tags search.
|
|
* If tag is found, returns the tag index in the $tags array.
|
|
* If tag is not found, returns number of $tags+1
|
|
*
|
|
* @param array $tags
|
|
* @return integer
|
|
*/
|
|
protected function find_tags_and_trim($tags)
|
|
{
|
|
$found = false;
|
|
do
|
|
{
|
|
$i = 0;
|
|
// for all the tags do:
|
|
foreach ($tags as $tag)
|
|
{
|
|
// can you find the startag ?
|
|
if (($pos = stripos($this->buffer, $tag)) !== false)
|
|
{
|
|
$found = true;
|
|
// set the cut $pos(ition) behind $tag
|
|
$pos+=strlen($tag);
|
|
// now cut away everything from the beginning till the cut position
|
|
$this->buffer = substr($this->buffer, $pos);
|
|
// and update the counters
|
|
$this->eoln_pos -= $pos;
|
|
break;
|
|
// this tag not found - increment cntr and try another one
|
|
}
|
|
else
|
|
{
|
|
$i++;
|
|
}
|
|
}
|
|
|
|
// tags not found and eoln found previously?
|
|
if (!$found && $this->eoln_pos > 0)
|
|
{
|
|
// cut away all from beginning till eoln
|
|
$this->buffer = substr($this->buffer, $this->eoln_pos);
|
|
$this->eoln_pos = 0;
|
|
}
|
|
}
|
|
while (!$found && $this->get_line());
|
|
|
|
return $i;
|
|
}
|
|
|
|
/**
|
|
* this functions tries to find the end of table row.
|
|
* It can handle even rows terminated incorrectly by
|
|
* </table> instead of </tr>
|
|
*
|
|
* @return integer The position of the end row tag (</tr> or </table>)
|
|
* or false if the tag is not found.
|
|
*/
|
|
protected function find_row_end()
|
|
{
|
|
/**
|
|
* PHP5 version: in PHP5, strripos can search whole string,
|
|
* not only 1 char as in PHP4
|
|
*/
|
|
if (($res = stripos($this->buffer, "<table")) !== false ||
|
|
($res = stripos($this->buffer, "</table")) !== false)
|
|
{
|
|
$this->table_end = true;
|
|
return $res;
|
|
}
|
|
|
|
if (($res = strripos($this->buffer, "</tr")) !== false)
|
|
return $res;
|
|
|
|
return strripos($this->buffer, "<tr");
|
|
|
|
/**
|
|
* PHP4 version: we have to use perl regular expressions...
|
|
* This is only 0.03sec/100kB slower than PHP5 strripos version
|
|
*
|
|
|
|
$matchCnt=preg_match("/<[\/]?(?:tr|table)(?!.*<[\/]?(tr|table))/si",$this->buffer, $matches, PREG_OFFSET_CAPTURE);
|
|
if ($matchCnt==1) return $matches[0][1];
|
|
else return false;
|
|
*/
|
|
}
|
|
|
|
/**
|
|
* get_table_rows tries to fill the buffer with at least one table row (<tr>...<[/]tr>) string.
|
|
* It then parses the rows using a regular expression, which returns the content of the
|
|
* table cells in the $this->matches array
|
|
* Because fread reads whole blocks, it is possible this
|
|
*
|
|
* @return bool
|
|
*/
|
|
protected function get_table_rows()
|
|
{
|
|
// Try to find the starting <tr> tag:
|
|
if (!$this->find_tag_and_trim("<tr"))
|
|
return false;
|
|
|
|
// now try to find the last <[/]tr> or <[/]table> tag by searching these
|
|
// tags not followed by the same tags, if not successfull, read the
|
|
// next line of the file. Do it until EOF or table end
|
|
|
|
while (($lastTagPos = $this->find_row_end()) === false &&
|
|
$this->table_end == false &&
|
|
$this->get_line());
|
|
|
|
// if <tr> not found untill EOF, return EOF
|
|
if ($lastTagPos === false)
|
|
return false;
|
|
|
|
// $rows is string containing several <tr>...<tr>... ended by <tr>
|
|
$rows = substr($this->buffer, 0, $lastTagPos);
|
|
|
|
// if HTML charset is not UTF-8
|
|
if (strcasecmp($this->charset, "utf-8") != 0)
|
|
{
|
|
// convert it to UTF-8
|
|
$rows = iconv($this->charset, "UTF-8", $rows);
|
|
}
|
|
|
|
// Now: get the contents of all the table cells (the texts between
|
|
// <td > and <td > or </td> or <tr> or </tr> tags
|
|
preg_match_all("/<td[^>]*>(?:<[^>]*>)?(.*?)<(?:(?:\/)?td|tr|table)/si", $rows, $this->matches);
|
|
|
|
$this->buffer = substr($this->buffer, $lastTagPos);
|
|
|
|
if ($this->eoln_pos > $lastTagPos)
|
|
{
|
|
$this->eoln_pos -= $lastTagPos;
|
|
}
|
|
else
|
|
{
|
|
$this->eoln_pos = 0;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Sets charset
|
|
*/
|
|
protected function get_charset()
|
|
{
|
|
// if charset is missng set utf8
|
|
if (!$this->find_tag_and_trim("charset="))
|
|
{
|
|
$this->charset = "utf-8";
|
|
}
|
|
else
|
|
{
|
|
// try to find "
|
|
if (($quotesPos = strpos($this->buffer, '"')) === false)
|
|
{
|
|
if (($quotesPos = strpos($this->buffer, "'")) === false)
|
|
{
|
|
die("Can't find the quotes after 'charset=...'");
|
|
}
|
|
}
|
|
$this->charset = substr($this->buffer, 0, $quotesPos);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Parse method
|
|
*
|
|
* @param string $url
|
|
*/
|
|
abstract function parse($url);
|
|
}
|