Revize 126
Přidáno uživatelem Tomáš Dulík asi před 16 lety
freenetis/trunk/kohana/application/libraries/Parser_Ebanka.php | ||
---|---|---|
<?php
|
||
require_once("Parser_Html_Table.php");
|
||
/*
|
||
ALTER TABLE `money_transfer_bank_infos`
|
||
ADD COLUMN `comment` VARCHAR(255) after `date_time`;
|
||
|
||
ALTER TABLE `accounts`
|
||
ADD COLUMN `number` VARCHAR(255) after `name`;
|
||
|
||
|
||
* */
|
||
|
||
/**
|
||
* @author Tomas <Dulik at unart dot cz>
|
||
* @version 1.0
|
||
* Parser_Ebanka is a parser for getting data from bank account transaction listing
|
||
* in the HTML format used by the Czech bank called "Ebanka" (now Raiffeisen Bank).
|
||
*
|
||
* The parsing is a bit peculiar, because Ebanka uses different format for
|
||
* listings that are visible to general public (the "transparent" listing used
|
||
* by NGOV non-profit organizations) and different format for regular listing used in the
|
||
* ebanking application.
|
||
* This parser autodetects these two formats and parses the data according to it.
|
||
*
|
||
* Benchmarks:
|
||
* Machine: Notebook FSC Lifebook S7110, CPU: Pentium T2400 @ 1.8 GHz
|
||
* Win XP SP2, Apache 2.2.3, PHP 5.2.0
|
||
* Regular listing with 136 table rows (1 week listing): time=0.1 sec, memory=205 kB
|
||
* Regular listing with 2175 table rows (whole year listing): time=1.6 sec, memory=205 kB
|
||
* Transparent listing with 467 table rows: time=0.14 sec, memory=122 kB
|
||
* */
|
||
class Parser_Ebanka extends Parser_Html_Table {

    /** Last string that precedes the beginning of the main <TABLE>. */
    const START_STRING="Pohyby na";

    /** Marker preceding the statement period; searched for to detect regular (non-transparent) listings. */
    const YEAR_STRING="za obdob";

    /** Year of the listing; false until it has been determined by parse(). */
    protected $year=false;

    /**
     * Callback invoked after one complete row of the HTML table has been parsed
     * (e.g. to store the row into a database). A function name as a string, or
     * array(object_or_class, method_name) for a method.
     * @see set_callback
     * @var string|mixed
     */
    protected $callback;

    /**
     * Object handed to the callback; it carries the fields of one parsed row.
     * The SAME object is reused (and overwritten) for every row.
     */
    protected $result;

    /**
     * Extracts the year of the listing from the buffer, which is expected to
     * start right behind the string "Za obdob", e.g.
     * buffer="í 1.12.2007/31.12.2007</td>" yields the year "2007".
     *
     * Reads more input until a '/' shows up; terminates the script (die) when
     * the '/' or the '.' preceding the year can not be found.
     */
    protected function get_year() {
        // look for the slash separating the two dates, reading more input if needed
        while (($slashPos=strpos($this->buffer, "/"))===false && $this->get_line());

        if ($slashPos===false) die("Nemůžu najít znak '/' v řetězci 'Za období ...'");

        // Everything in front of the slash. Taking it from offset 0 (the original
        // skipped the first byte) gives the same year and stays well-defined
        // even when the slash is the very first character.
        $toSlash=substr($this->buffer, 0, $slashPos);

        // the year follows the last dot in front of the slash
        if (($dotPos=strrpos($toSlash, "."))===false)
            die("Nemůžu najít znak '.' v řetězci 'Za období ...'");

        $this->year=trim(substr($toSlash, $dotPos+1));
    }

    /**
     * Normalises an amount cell: strips tags, removes thousands separators
     * (plain spaces and non-breaking spaces) and turns the decimal comma into a dot.
     *
     * NOTE(review): the available copy of the original shows two identical " "
     * literals here; presumably one of them was a non-breaking space / "&nbsp;"
     * lost in transit - confirm. Removing all three variants is a superset.
     *
     * @param string $field raw content of the table cell
     * @return string amount such as "-1234.50"
     */
    protected function get_castka($field) {
        $field=strip_tags($field);
        $field=str_replace(array(" ", "&nbsp;", "\xC2\xA0"), "", $field);
        return strtr($field, ",", ".");
    }

    /**
     * Splits a cell on <br>, <BR>, <br/> or <br /> and trims every part.
     * The result is padded with empty strings so that at least $min_parts
     * elements exist and callers can index it without notices.
     *
     * @param string $field     cell content
     * @param int    $min_parts minimum number of returned elements
     * @return array
     */
    protected function split_br($field, $min_parts) {
        $parts=array_map('trim', preg_split('/<br\s*\/?>/i', $field));
        return array_pad($parts, $min_parts, "");
    }

    /**
     * Shared row loop of both listing formats.
     *
     * Repeatedly asks get_table_rows() for a batch of cells, skips the header
     * row of the very first batch, stores every cell through $store_method and
     * fires the callback once the last cell of a row has been stored.
     *
     * The loop stops as soon as get_table_rows() returns false, so a batch left
     * over in $this->matches from the previous call is never processed twice.
     *
     * @param int    $cells_per_row number of <td> cells in one table row
     * @param string $store_method  name of the method storing one cell
     */
    protected function process_rows($cells_per_row, $store_method) {
        // without set_callback() there is no result object yet
        if (!isset($this->result)) $this->result=new stdClass();
        $res=$this->result;
        $first=true;

        while ($this->get_table_rows()!==false) {
            $cells=isset($this->matches[1]) ? $this->matches[1] : array();
            $fields=str_replace(array("\r", "\n", "\t"), "", $cells);
            $nr=count($fields);

            // the first row of the first batch is the table header - skip it
            $i=$first ? $cells_per_row : 0;
            $first=false;

            for (; $i<$nr; $i++) {
                $field_nr=$i % $cells_per_row;
                $this->$store_method($res, $field_nr, $fields[$i]);

                // last cell of the row: the row is complete, hand it over
                if ($field_nr==$cells_per_row-1 && isset($this->callback))
                    call_user_func($this->callback, $res);
            }
        }
    }

    /**
     * Stores one cell of a "transparent" (publicly visible) listing row.
     *
     * @param object $res      result object being filled
     * @param int    $field_nr index of the cell within its row (0..5)
     * @param string $field    cell content
     */
    protected function store_transparent_cell($res, $field_nr, $field) {
        switch ($field_nr) {
            case 0: // example: 31.08.2008<br/>06:11
                $arr=$this->split_br($field, 2);
                $arrDate=array_pad(explode(".", $arr[0]), 3, "");
                $res->date_time=$arrDate[2]."-".$arrDate[1]."-".$arrDate[0]." ".$arr[1];
                break;
            case 1: // notes<br/>name of the payer's account
                $arr=$this->split_br(html_entity_decode($field,ENT_QUOTES,"UTF-8"), 2);
                $res->comment=$arr[0];
                $res->name=$arr[1];
                break;
            case 2: // the debit date twice in a row<br/>type of payment
                $arr=$this->split_br($field, 3);
                $res->typ=html_entity_decode($arr[2],ENT_QUOTES,"UTF-8");
                break;
            case 3: // VS<br/>KS<br/>SS
                $arr=$this->split_br($field, 3);
                $res->variable_symbol=$arr[0];
                $res->constant_symbol=$arr[1];
                $res->specific_symbol=$arr[2];
                break;
            case 4: // amount
                $res->castka=$this->get_castka($field);
                break;
            case 5: // fee
                $res->poplatek=$this->get_castka($field);
                break;
        }
    }

    /**
     * Stores one cell of a regular (e-banking) listing row.
     *
     * Entities are decoded first and non-breaking spaces removed.
     * NOTE(review): the available copy of the original removes " " from every
     * field, which would also glue the words of the comment and the account
     * name together; presumably the literal was a non-breaking space - confirm.
     * Plain spaces are therefore kept and the parts are only trimmed.
     *
     * @param object $res      result object being filled
     * @param int    $field_nr index of the cell within its row (0..6)
     * @param string $field    cell content
     */
    protected function store_regular_cell($res, $field_nr, $field) {
        $field=html_entity_decode($field,ENT_QUOTES,"UTF-8");
        $field=str_replace(array("\xC2\xA0", "&nbsp;"), "", $field);

        switch ($field_nr) {
            case 0: // statement number, every month from 1 to N
                $res->cislo=trim($field);
                break;
            case 1: // date and time, example: 08.08.<br>06:11
                $arr=$this->split_br($field, 2);
                $arrDate=array_pad(explode(".", $arr[0]), 2, "");
                $res->date_time=$this->year."-".$arrDate[1]."-".$arrDate[0]." ".$arr[1];
                break;
            case 2: // notes<br>account name<br>account number of the payer
                $arr=$this->split_br($field, 3);
                $res->comment=$arr[0];
                $res->name=$arr[1];
                $res->cislo_uctu=$arr[2];
                break;
            case 3: // debit date<br><br>type of payment
                $arr=$this->split_br($field, 3);
                $res->typ=$arr[2];
                break;
            case 4: // SS<br>VS<br>KS
                $arr=$this->split_br($field, 3);
                $res->variable_symbol=$arr[1];
                $res->constant_symbol=$arr[2];
                $res->specific_symbol=$arr[0];
                break;
            case 5: // amount
                $res->castka=$this->get_castka($field);
                break;
            case 6: // fee
                $res->poplatek=$this->get_castka($field);
                break;
        }
    }

    /**
     * Parses the rows of a "transparent" listing (6 cells per row) and calls
     * the callback, if one is set, after every complete row.
     */
    protected function get_data_from_transparent() {
        $this->process_rows(6, 'store_transparent_cell');
    }

    /**
     * Parses the rows of a regular listing (7 cells per row) and calls the
     * callback, if one is set, after every complete row.
     */
    protected function get_data_from_regular() {
        $this->process_rows(7, 'store_regular_cell');
    }

    /**
     * Sets the function that is called after each row of the HTML table has
     * been parsed.
     *
     * @param string|mixed $function name of the callback function as a string;
     *        for a method pass array(class_or_object, method_name)
     * @param object|null  $param    object to be filled with the row data and
     *        passed to the callback; a new stdClass is created when omitted
     */
    public function set_callback($function, $param=NULL) {
        $this->callback=$function;
        if (!isset($param)) $this->result=new stdClass();
        else $this->result=$param;
    }

    /**
     * Opens the listing at $url, detects its format and parses all rows.
     *
     * When the period marker (YEAR_STRING) is found, the listing is treated as
     * regular and the year is taken from the period; when only the table start
     * (START_STRING) is found, it is treated as transparent. Terminates the
     * script (die) when neither marker or the table start can be found.
     *
     * @param string $url URL or path of the HTML listing
     */
    public function parse($url) {
        $this->open($url);
        $this->get_charset();

        // search for the period of the listing or the beginning of the table
        $found=$this->find_tags_and_trim(array(self::YEAR_STRING, self::START_STRING));

        switch ($found) {
            case 0: // period found = regular (non-transparent) listing
                $transparent=false;
                $this->get_year();
                if (!$this->find_tag_and_trim(self::START_STRING))
                    die ("Nemohu najít začátek tabulky: '".self::START_STRING."'");
                break;
            case 1: // period not found - transparent listing
                $transparent=true;
                $this->year=date("Y");
                break;
            default: // none of the markers found
                die ("Nemohu najít začátek tabulky nebo datum/rok");
        }

        if ($transparent)
            $this->get_data_from_transparent();
        else
            $this->get_data_from_regular();

        fclose($this->file);
    }
}
|
||
|
||
?>
|
freenetis/trunk/kohana/application/libraries/Parser_Html_Table.php | ||
---|---|---|
<?php
|
||
/**
|
||
* @author Tomas <Dulik at unart dot cz>
|
||
* @version 1.0
|
||
|
||
* Parser_Html_Table is ABSTRACT class containing methods useful
|
||
* for parsing HTML tables in generic HTML files.
|
||
*
|
||
* Motivation: we want to parse HTML tables to get interesting data from various web sites.
|
||
* The HTML code of the tables often does not conforms to XML/XHTML rules.
|
||
* It often does not conform even HTML4, e.g. - the table row is not closed by </tr>,
|
||
* table cell is not closed by </td> etc.
|
||
* Therefore, XML parsers can't be used for this.
|
||
* The Tidy extension is not available on all hostings.
|
||
* If you think about parsing a non-XHTML non-HTML4.0 table, look at this class.
|
||
* The methods have been optimized to give maximum possible performance
|
||
* and memory efficiency.
|
||
* For an example how to use this class, see the Parser_Ebanka class
|
||
*/
|
||
abstract class Parser_Html_Table {

    /** Timeout in seconds used for opening the stream and for reading from it. */
    const TIMEOUT=3;

    /** Stream handle returned by fopen(). */
    protected $file;

    /** Charset of the document; stays "utf-8" unless get_charset() finds another one. */
    protected $charset="utf-8";

    /** Part of the input that has been read but not consumed yet. */
    protected $buffer="";

    /** Position of the last End Of Line in the buffer (may be stale-safe: it is clamped before use). */
    protected $eoln_pos=0;

    /** Result of the last preg_match_all() in get_table_rows(); [1] holds the cell contents. */
    protected $matches=array(array(), array());

    /**
     * Opens $url for reading and resets the parsing state.
     * Does nothing when $url is empty. Terminates the script (die) when the
     * stream can not be opened.
     *
     * @param string $url URL or path of the document
     */
    public function open($url) {
        if ($url!="") {
            $old = ini_set('default_socket_timeout', (string) self::TIMEOUT);
            if (($this->file=fopen($url, "rb"))===false)
                die ("Can not open file! Check if $url exists!");
            if ($old!==false) ini_set('default_socket_timeout', $old);
            stream_set_timeout($this->file, self::TIMEOUT);

            // start from a clean state, nothing may leak in from a previous document
            $this->buffer="";
            $this->eoln_pos=0;
            $this->matches=array(array(), array());
        }
    }

    /**
     * Appends data from the file to the buffer until the buffer contains at
     * least one end of line, and stores the position of the last one in
     * $this->eoln_pos. When the end of the file is reached without any end of
     * line, eoln_pos is set to the length of the buffer.
     *
     * The input is read by 8192-byte blocks (or one packet), so a file with
     * very long lines never has to be read "line by line" into memory.
     *
     * @return boolean false when nothing could be read (EOF at entry, read
     *         error or stream timeout), true otherwise
     */
    public function get_line() {
        $read_done=false;

        while (!feof($this->file)) {
            $new_part = fread($this->file, 8192);

            if ($new_part===false) return false;      // read error - nothing more to get
            if ($new_part==="") {
                // an empty read on a timed-out stream would repeat forever
                $meta=stream_get_meta_data($this->file);
                if (!empty($meta['timed_out'])) return false;
            }

            $read_done=true;
            $this->buffer .= $new_part;

            // search the end of line from the end (plain byte search, no case folding needed)
            if (($this->eoln_pos=strrpos($this->buffer, "\n"))!==false)
                return true;
        }

        if (!$read_done) return false;              // EOF right when the function begun
        $this->eoln_pos=strlen($this->buffer);      // EOF happened but no EOLN
        return true;
    }

    /**
     * Returns the position up to which the buffer may be thrown away when a
     * searched tag was not found: the last end of line, clamped into the
     * buffer. The last not-yet-terminated line has to stay in the buffer.
     *
     * @return int
     */
    private function safe_cut_position() {
        return max(0, min((int) $this->eoln_pos, strlen($this->buffer)));
    }

    /**
     * Cuts away the first $pos bytes of the buffer and keeps eoln_pos in sync.
     *
     * @param int $pos number of bytes to drop
     */
    private function cut_buffer($pos) {
        // the cast covers PHP versions where substr() returns false for an empty result
        $this->buffer=(string) substr($this->buffer, $pos);
        $this->eoln_pos=(int) $this->eoln_pos - $pos;
    }

    /**
     * Tries to find $tag (case-insensitively), reading more input as needed,
     * and trims the beginning of the buffer up to and including the $tag.
     *
     * @param string $tag searched string
     * @return boolean true if found (the buffer then starts right behind the
     *         first occurrence of $tag), false if the end of input was reached
     */
    protected function find_tag_and_trim($tag) {
        $found=false;
        do {
            if (($pos=stripos($this->buffer, $tag))!==false) {
                $found=true;
                $pos+=strlen($tag);                 // cut behind the tag
            } else {
                $pos=$this->safe_cut_position();    // drop the complete lines only
            }
            $this->cut_buffer($pos);
        } while ( !$found && $this->get_line() );

        return $found;
    }

    /**
     * The same as find_tag_and_trim, but for several tags. The tags are tried
     * in the order of the array, so when more of them are present in the
     * buffer, the one listed first wins.
     *
     * @param array $tags searched strings
     * @return int index (counted from 0) of the tag that was found, or
     *         count($tags) when none of them was found
     */
    protected function find_tags_and_trim($tags) {
        $found=false;
        do {
            $i=0;
            $pos=false;
            foreach ($tags as $tag) {
                if (($pos=stripos($this->buffer, $tag))!==false) {
                    $found=true;
                    $pos+=strlen($tag);             // cut behind the tag
                    break;
                }
                $i++;                               // this tag not found - try another one
            }

            if (!$found) $pos=$this->safe_cut_position();
            $this->cut_buffer($pos);
        } while ( !$found && $this->get_line() );

        return $i;
    }

    /**
     * Tries to find the end of the last table row in the buffer. It can handle
     * rows that are not closed by </tr> (the next <tr is used instead) and
     * rows terminated by </table>.
     *
     * @return int|false position of the last "</tr", else of the last "<tr",
     *         else of the last "</table"; false if none of them is present
     */
    protected function find_row_end() {
        if (($res=strripos($this->buffer, "</tr"))!==false)
            return $res;
        if (($res=strripos($this->buffer, "<tr"))!==false)
            return $res;
        return strripos($this->buffer, "</table");
    }

    /**
     * Fills the buffer with at least one table row and parses the contents of
     * its cells into $this->matches (index 1 holds the cell texts, converted
     * to UTF-8 when the document uses another charset). Because the input is
     * read by blocks, one call usually delivers several rows at once.
     *
     * $this->matches is emptied at the beginning, so after a false return it
     * never contains the cells of a previous call.
     *
     * @return boolean true when rows were parsed, false when no further row
     *         could be found before the end of input
     */
    protected function get_table_rows() {
        $this->matches=array(array(), array());

        // find the starting <tr> tag
        if (! $this->find_tag_and_trim("<tr") ) return false;

        // find the end of the last row, reading more input if necessary
        while ( ($lastTagPos=$this->find_row_end()) === false
                && $this->get_line());

        if ($lastTagPos===false) return false;      // no row end until EOF

        $rows=substr($this->buffer, 0, $lastTagPos);

        if (strcasecmp($this->charset, "utf-8")!=0) {
            $converted=iconv($this->charset, "UTF-8", $rows);
            if ($converted===false)
                die ("Can not convert the table rows from {$this->charset} to UTF-8!");
            $rows=$converted;
        }

        // get the contents of all the table cells: the texts between
        // <td ...> and the following <td, </td or <tr
        preg_match_all("/<td[^>]*>(?:<[^>]*>)?(.*?)<(?:(?:\/)?td|tr)/si", $rows, $this->matches);

        $this->cut_buffer($lastTagPos);
        return true;
    }

    /**
     * Detects the charset of the document from the first "charset=" in it and
     * leaves the buffer positioned right behind that string.
     *
     * The value may be unquoted or enclosed in single or double quotes. The
     * search stops at "</head>" or "<body", so a document without any charset
     * declaration is not consumed up to its end; "utf-8" is assumed then.
     * Terminates the script (die) when "charset=" is not followed by a value.
     */
    protected function get_charset() {
        $found=$this->find_tags_and_trim(array("charset=", "</head>", "<body"));
        if ($found!=0) {                            // no charset declared: assume utf-8
            $this->charset="utf-8";
            return;
        }

        // the value has to be followed by some terminator, otherwise it may be
        // cut by the end of the block read so far - read more in that case
        $complete='/^\s*["\']?\s*([A-Za-z0-9._:+-]+)[^A-Za-z0-9._:+-]/';
        while (!($ok=preg_match($complete, $this->buffer, $m)) && $this->get_line());

        // end of input right behind the value
        if (!$ok && !preg_match('/^\s*["\']?\s*([A-Za-z0-9._:+-]+)/', $this->buffer, $m))
            die("Can't find the quotes after 'charset=...'");

        $this->charset=$m[1];
    }

    /**
     * Parses the document at $url; implemented by the concrete parsers.
     *
     * @param string $url URL or path of the document
     */
    abstract public function parse($url);
}
|
||
?>
|
Také k dispozici: Unified diff
Přidány nové knihovny pro parsování výpisů z Raiffeisen eBanky - Parser_Ebanka.php a Parser_Html_Table.php. Příklad použití:
require_once("c:/www/freenetis/kohana/application/libraries/Parser_Ebanka.php");
Main::start();
class Main {
private static $linenr=0;