Projekt

Obecné

Profil

« Předchozí | Další » 

Revize 126

Přidáno uživatelem Tomáš Dulík před asi 16 roky(ů)

Přidány nové knihovny pro parsování výpisů z Raiffeisen eBanky - Parser_Ebanka.php a Parser_Html_Table.php. Příklad použití:

// Load the parser library; the absolute path is specific to the author's machine.
require_once("c:/www/freenetis/kohana/application/libraries/Parser_Ebanka.php");
// Run the demo implemented by the Main class declared below.
Main::start();

/**
 * Demo driver for Parser_Ebanka: parses one bank listing and prints every
 * parsed row as a row of an HTML table, followed by the number of rows printed.
 */
class Main {
    /** Number of rows printed so far by printdata(). */
    private static $linenr=0;

    /**
     * Parser callback: prints one parsed listing row as an HTML table row.
     *
     * @param array|object $data The row; every element/public property becomes one cell.
     */
    static function printdata($data) {
        // Count on the class property. The previous "global $linenr" incremented an
        // unrelated global variable, so start() always reported Linenr=0.
        self::$linenr++;
        echo "<tr>\n";
        foreach ($data as $d) {
            // The values come from an external HTML listing, so escape them before output.
            echo " <td>".htmlspecialchars((string) $d, ENT_QUOTES, "UTF-8")."</td>\n";
        }
        echo "</tr>\n";
    }

    /**
     * Parses the sample listing and prints it as an HTML table.
     */
    static function start() {
        echo "<table border=1>\n";
        $parser=new Parser_Ebanka();
        $parser->set_callback(array("Main", "printdata"));
        $parser->parse("c:/txt/unart/ucto/vypisy/2007/09.html");
        // Alternative inputs used by the author during development:
        //$parser->parse("http://www.rb.cz/firemni-finance/transparentni-ucty/?root=firemni-finance&item1=transparentni-ucty&tr_acc=vypis&account_number=184932848");
        // $parser->parse("c:/txt/unart/ucto/vypisy/2007/cast2008.htm");
        // $parser->parse("http://localhost/2007.html");
        //$parser->parse("c:/txt/unart/ucto/vypisy/2007/2007.html");
        echo "</table>\n";
        echo "Linenr=".self::$linenr."<p>";
    }
}

Zobrazit rozdíly:

freenetis/trunk/kohana/application/libraries/Parser_Ebanka.php
<?php
require_once("Parser_Html_Table.php");
/*
ALTER TABLE `money_transfer_bank_infos`
ADD COLUMN `comment` VARCHAR(255) after `date_time`;
ALTER TABLE `accounts`
ADD COLUMN `number` VARCHAR(255) after `name`;
* */
/**
* @author Tomas <Dulik at unart dot cz>
* @version 1.0
* Parser_Ebanka is a parser for getting data from bank account transaction listing
* in the HTML format used by the Czech bank called "Ebanka" (now Raiffeisen Bank).
*
* The parsing is a bit peculiar, because Ebanka uses one format for
* listings that are visible to the general public (the "transparent" listing used
* by NGOs and other non-profit organizations) and a different format for the regular
* listing used in the ebanking application.
* This parser autodetects these two formats and parses the data according to it.
*
* Benchmarks:
* Machine: Notebook FSC Lifebook S7110, CPU: Pentium T2400 @ 1.8 GHz
* Win XP SP2, Apache 2.2.3, PHP 5.2.0
* Regular listing with 136 table rows (1 week listing): time=0.1 sec, memory=205 kB
* Regular listing with 2175 table rows (whole year listing): time=1.6 sec, memory=205 kB
* Transparent listing with 467 table rows: time=0.14 sec, memory=122 kB
* */
class Parser_Ebanka extends Parser_Html_Table {
    const START_STRING="Pohyby na"; // last string before the main <TABLE> starts
    const YEAR_STRING="za obdob"; // present only in regular (non-transparent) listings
    const TRANSPARENT_COLUMNS=6; // cells per row in the transparent listing
    const REGULAR_COLUMNS=7; // cells per row in the regular listing

    /** Year of the listing; false until parse() has determined it. */
    protected $year=false;

    /**
     * Callback invoked after each complete table row has been parsed,
     * e.g. to store the row in a database.
     * @see set_callback
     * @var string|mixed
     */
    protected $callback;

    /** Object passed to the callback; its properties hold one parsed row. */
    protected $result;

    /**
     * Extracts the listing year from the buffer, which is expected to start
     * right behind the "za obdob" string, e.g. "í 1.12.2007/31.12.2007</td>"
     * yields "2007". Terminates the script with die() if the date cannot be found.
     */
    protected function get_year() {
        // Read more input until the buffer contains the slash separating the two dates.
        while (($slashPos=strpos($this->buffer, "/"))===false &&
            $this->get_line());
        if ($slashPos===false)
            die("Nemůžu najít znak '/' v řetězci 'Za období ...'");
        $toSlash=substr($this->buffer, 1, $slashPos-1);
        // The year follows the last dot in front of the slash.
        if (($dotPos=strrpos($toSlash, "."))===false)
            die("Nemůžu najít znak '.' v řetězci 'Za období ...'");
        $this->year=trim(substr($toSlash, $dotPos+1));
    }

    /**
     * Normalizes an amount cell: removes tags, HTML entities and (non-breaking)
     * spaces and turns the decimal comma into a dot.
     *
     * @param string $field Raw cell content, e.g. "1&nbsp;234,50".
     * @return string Amount such as "1234.50".
     */
    protected function get_castka($field) {
        $field=strip_tags($field);
        // The transparent listing passes raw cells, so "&nbsp;" may still be encoded here.
        $field=html_entity_decode($field, ENT_QUOTES, "UTF-8");
        // "\xC2\xA0" is the UTF-8 non-breaking space used as thousands separator.
        $field=str_replace(array("\xC2\xA0", " "), "", $field);
        return strtr($field, ",", ".");
    }

    /**
     * Returns the object the parsed fields are written to, creating a plain
     * stdClass when set_callback() has not provided one.
     */
    private function row_object() {
        if (!isset($this->result)) $this->result=new stdClass();
        return $this->result;
    }

    /**
     * Returns $parts[$index], or null when the cell had fewer parts than expected.
     */
    private static function part($parts, $index) {
        return isset($parts[$index]) ? $parts[$index] : null;
    }

    /**
     * Parses the rows of a "transparent" (publicly visible) listing.
     * Each row has 6 cells; after the last cell of a row the callback is invoked
     * with the result object holding date_time, comment, name, typ,
     * variable_symbol, constant_symbol, specific_symbol, castka and poplatek.
     */
    protected function get_data_from_transparent() {
        $res=$this->row_object();
        $first=true;
        // Loop only while a new chunk of rows was really parsed. The former do/while
        // processed $this->matches once more after get_table_rows() had failed,
        // which reported the last chunk of rows twice.
        while ($this->get_table_rows()) {
            $fields=str_replace(array("\r", "\n", "\t"), "", $this->matches[1]);
            $nr=count($fields);
            // The first row of the table is the header - skip its cells.
            $i=$first ? self::TRANSPARENT_COLUMNS : 0;
            $first=false;
            for (; $i<$nr; $i++) {
                $field=$fields[$i];
                switch ($i % self::TRANSPARENT_COLUMNS) {
                    case 0: // example: 31.08.2008<br/>06:11
                        $arr=explode("<br/>", $field);
                        $arrDate=explode(".", $arr[0]);
                        $res->date_time=self::part($arrDate, 2)."-".self::part($arrDate, 1)."-".$arrDate[0]." ".self::part($arr, 1);
                        break;
                    case 1: // comment<br/>payer account name
                        $field=html_entity_decode($field, ENT_QUOTES, "UTF-8");
                        $arr=explode("<br/>", $field);
                        $res->comment=$arr[0];
                        $res->name=self::part($arr, 1);
                        break;
                    case 2: // debit date (twice)<br/>payment type
                        $arr=explode("<br/>", $field);
                        $res->typ=html_entity_decode((string) self::part($arr, 2), ENT_QUOTES, "UTF-8");
                        break;
                    case 3: // VS<br/>KS<br/>SS
                        $arr=explode("<br/>", $field);
                        $res->variable_symbol=$arr[0];
                        $res->constant_symbol=self::part($arr, 1);
                        $res->specific_symbol=self::part($arr, 2);
                        break;
                    case 4: // amount
                        $res->castka=$this->get_castka($field);
                        break;
                    case 5: // fee - last cell, the row is complete
                        $res->poplatek=$this->get_castka($field);
                        if (isset($this->callback)) call_user_func($this->callback, $res);
                        break;
                }
            }
        }
    }

    /**
     * Parses the rows of a regular (ebanking) listing.
     * Each row has 7 cells; after the last cell of a row the callback is invoked
     * with the result object holding cislo, date_time, comment, name, cislo_uctu,
     * typ, variable_symbol, constant_symbol, specific_symbol, castka and poplatek.
     * The year of date_time comes from $this->year because the cells contain
     * only day and month.
     */
    protected function get_data_from_regular() {
        $res=$this->row_object();
        $first=true;
        // See get_data_from_transparent(): stop as soon as no further rows were parsed,
        // otherwise the last chunk would be reported twice.
        while ($this->get_table_rows()) {
            $fields=str_replace(array("\r", "\n", "\t"), "", $this->matches[1]);
            $nr=count($fields);
            // The first row of the table is the header - skip its cells.
            $i=$first ? self::REGULAR_COLUMNS : 0;
            $first=false;
            for (; $i<$nr; $i++) {
                // Decode entities and drop the non-breaking spaces ("&nbsp;") they produce.
                $field=html_entity_decode($fields[$i], ENT_QUOTES, "UTF-8");
                $field=str_replace("\xC2\xA0", "", $field);
                switch ($i % self::REGULAR_COLUMNS) {
                    case 0: // statement number, 1..N within each month
                        $res->cislo=$field;
                        break;
                    case 1: // date and time, example: 08.08.<br>06:11
                        $arr=preg_split("/<br>/si", $field);
                        $arrDate=explode(".", $arr[0]);
                        $res->date_time=$this->year."-".self::part($arrDate, 1)."-".$arrDate[0]." ".self::part($arr, 1);
                        break;
                    case 2: // comment<br>payer account name<br>payer account number
                        $arr=preg_split("/<br>/si", $field); // matches <BR> as well as <br>
                        $res->comment=$arr[0];
                        $res->name=self::part($arr, 1);
                        $res->cislo_uctu=self::part($arr, 2);
                        break;
                    case 3: // debit date<br><br>payment type
                        $arr=preg_split("/<br>/si", $field);
                        $res->typ=self::part($arr, 2);
                        break;
                    case 4: // SS<br>VS<br>KS
                        $arr=preg_split("/<br>/si", $field);
                        $res->variable_symbol=self::part($arr, 1);
                        $res->constant_symbol=self::part($arr, 2);
                        $res->specific_symbol=$arr[0];
                        break;
                    case 5: // amount
                        $res->castka=$this->get_castka($field);
                        break;
                    case 6: // fee - last cell, the row is complete
                        $res->poplatek=$this->get_castka($field);
                        if (isset($this->callback)) call_user_func($this->callback, $res);
                        break;
                }
            }
        }
    }

    /**
     * Sets the function called after each parsed row of the HTML table.
     *
     * @param string|mixed $function Callback name as a string; for a class method
     *        pass array(class, method_name).
     * @param object|null $param Object the parsed fields are written to and which is
     *        passed to the callback; a new stdClass is used when omitted.
     */
    public function set_callback($function, $param=NULL) {
        $this->callback=$function;
        if (!isset($param)) $this->result=new stdClass();
        else $this->result=$param;
    }

    /**
     * Parses the listing at $url, detecting its format automatically, and invokes
     * the callback for each row. Terminates the script with die() when neither
     * the listing period nor the start of the table can be found.
     *
     * @param string $url File name or URL of the HTML listing.
     */
    public function parse($url) {
        $this->open($url);
        $this->get_charset();
        // Search for the listing period (regular listing) or directly for the table start.
        $found=$this->find_tags_and_trim(array(self::YEAR_STRING, self::START_STRING));
        switch ($found) {
            case 0: // period found = regular (non-transparent) listing
                $transparent=false;
                $this->get_year();
                if (!$this->find_tag_and_trim(self::START_STRING))
                    die ("Nemohu najít začátek tabulky: '".self::START_STRING."'");
                break;
            case 1: // period not found, table start found = transparent listing
                $transparent=true;
                $this->year=date("Y");
                break;
            default: // neither string found; also guards against $transparent staying undefined
                die ("Nemohu najít začátek tabulky nebo datum/rok");
        }
        if ($transparent)
            $this->get_data_from_transparent();
        else
            $this->get_data_from_regular();
        fclose($this->file);
    }
}
?>
freenetis/trunk/kohana/application/libraries/Parser_Html_Table.php
<?php
/**
* @author Tomas <Dulik at unart dot cz>
* @version 1.0
* Parser_Html_Table is ABSTRACT class containing methods useful
* for parsing HTML tables in generic HTML files.
*
* Motivation: we want to parse HTML tables to get interesting data from various web sites.
* The HTML code of the tables often does not conform to XML/XHTML rules.
* It often does not conform even to HTML4, e.g. the table row is not closed by </tr>,
* the table cell is not closed by </td> etc.
* Therefore, XML parsers can't be used for this.
* The Tidy extension is not available on all hostings.
* If you think about parsing a non-XHTML non-HTML4.0 table, look at this class.
* The methods have been optimized to give maximum possible performance
* and memory efficiency.
* For an example how to use this class, see the Parser_Ebanka class
*/
abstract class Parser_Html_Table {
    /** Timeout in seconds used for opening and reading the source stream. */
    const TIMEOUT=3;
    /** Stream resource opened by open(). */
    protected $file;
    /** Charset of the document; "utf-8" unless get_charset() finds another one. */
    protected $charset="utf-8";
    /** Part of the input that has been read but not consumed yet. */
    protected $buffer="";
    /** Position of the last known End Of Line in the buffer (never negative). */
    protected $eoln_pos=0;
    /** Result of the last preg_match_all() in get_table_rows(); [1] holds the cell contents. */
    protected $matches;

    /**
     * Opens the file or URL for reading and resets the read buffer.
     * Does nothing when $url is empty. Terminates the script with die() when
     * the source cannot be opened.
     *
     * @param string $url File name or URL.
     */
    public function open($url) {
        if ($url!="") {
            $old=ini_set('default_socket_timeout', self::TIMEOUT);
            if (($this->file=fopen($url, "rb"))===false)
                die ("Can not open file! Check if $url exists!");
            // Restore the previous timeout only if ini_set() returned one.
            if ($old!==false) ini_set('default_socket_timeout', $old);
            stream_set_timeout($this->file, self::TIMEOUT);
            // Start with a clean state so that the object can parse several sources.
            $this->buffer="";
            $this->eoln_pos=0;
        }
    }

    /**
     * Appends AT LEAST one line from $file to $buffer and updates $eoln_pos.
     * Data is read in blocks of up to 8192 bytes until the buffer contains an
     * end of line or the end of file is reached.
     *
     * The original author measured this to be faster than fgets(). For files with
     * extremely long lines the end-of-line search could be replaced by find_row_end().
     *
     * @return boolean false when nothing could be read (EOF, read error or
     *         stream timeout), true otherwise.
     */
    public function get_line() {
        $read_something=false;
        while (!feof($this->file)) {
            $new_part=fread($this->file, 8192);
            // A failed read never sets EOF; give up instead of looping forever.
            if ($new_part===false) return false;
            $read_something=true;
            $this->buffer.=$new_part;
            // A timed-out stream reports neither data nor EOF; treat it as end of input.
            $meta=stream_get_meta_data($this->file);
            if (!empty($meta['timed_out'])) return false;
            $pos=strrpos($this->buffer, "\n"); // search the end of line from the end
            if ($pos!==false) {
                $this->eoln_pos=$pos;
                return true;
            }
        }
        // EOF right when the function began? Report EOF.
        if (!$read_something) return false;
        // EOF reached without any end of line: the whole buffer is the last line.
        $this->eoln_pos=strlen($this->buffer);
        return true;
    }

    /**
     * Removes the first $count bytes from the buffer and keeps $eoln_pos
     * consistent with the shortened buffer (clamped at 0, so a later trim
     * can never cut at a stale or negative position).
     */
    private function discard($count) {
        $this->buffer=(string) substr($this->buffer, $count);
        $this->eoln_pos=max(0, $this->eoln_pos-$count);
    }

    /**
     * Searches the input for $tag (case-insensitively) and trims the beginning
     * of the buffer up to and including the tag.
     *
     * @param string $tag String to search for.
     * @return boolean true if the tag was found (the buffer then starts right
     *         behind its first occurrence), false if the end of input was reached.
     */
    protected function find_tag_and_trim($tag) {
        return $this->find_tags_and_trim(array($tag))===0;
    }

    /**
     * Same as find_tag_and_trim(), but for several tags. In each pass the tags
     * are tried in array order and the first tag present in the buffer wins.
     *
     * @param array $tags Strings to search for.
     * @return int Index of the found tag in $tags, or count($tags) when none
     *         of the tags was found before the end of input.
     */
    protected function find_tags_and_trim($tags) {
        do {
            $index=0;
            $found=false;
            foreach ($tags as $tag) {
                $pos=stripos($this->buffer, $tag);
                if ($pos!==false) {
                    $found=true;
                    $cut=$pos+strlen($tag); // cut right behind the tag
                    break;
                }
                $index++; // this tag is not in the buffer - try the next one
            }
            // Nothing found: drop all complete lines, but keep the last,
            // not yet terminated line because a tag may continue in the next block.
            if (!$found) $cut=$this->eoln_pos;
            $this->discard($cut);
        } while (!$found && $this->get_line());
        return $index;
    }

    /**
     * Finds the end of the last table row present in the buffer. Handles rows
     * that are not closed by </tr> but followed by another <tr> or by </table>.
     *
     * @return int|false Position of the last "</tr", else of the last "<tr",
     *         else of the last "</table"; false if none of them is in the buffer.
     */
    protected function find_row_end() {
        if (($res=strripos($this->buffer, "</tr"))!==false)
            return $res;
        // Strict comparison: position 0 is a valid match, not "not found".
        if (($res=strripos($this->buffer, "<tr"))!==false)
            return $res;
        return strripos($this->buffer, "</table");
    }

    /**
     * Fills the buffer with at least one table row (<tr>...<[/]tr>), converts
     * the rows to UTF-8 if needed and stores the contents of their cells
     * in $this->matches[1]. Because the input is read in blocks, one call
     * usually returns the cells of several rows.
     *
     * @return boolean true if rows were parsed; false at the end of input,
     *         in which case $this->matches contains no cells.
     */
    protected function get_table_rows() {
        // Never leave the cells of a previous call behind when this call fails.
        $no_cells=array(array(), array());
        // Find the starting <tr> tag.
        if (!$this->find_tag_and_trim("<tr")) {
            $this->matches=$no_cells;
            return false;
        }
        // Find the last row end; read more input while there is none in the buffer.
        while (($lastTagPos=$this->find_row_end())===false
            && $this->get_line());
        if ($lastTagPos===false) {
            $this->matches=$no_cells;
            return false;
        }
        $rows=substr($this->buffer, 0, $lastTagPos); // one or more complete rows
        if (strcasecmp($this->charset, "utf-8")!=0) // if HTML charset is not UTF-8
            $rows=(string) iconv($this->charset, "UTF-8", $rows); // convert it to UTF-8
        // Get the contents of all the table cells: the text between <td ...> and
        // the next <td, </td or <tr tag.
        preg_match_all("/<td[^>]*>(?:<[^>]*>)?(.*?)<(?:(?:\/)?td|tr)/si", $rows, $this->matches);
        $this->discard($lastTagPos);
        return true;
    }

    /**
     * Detects the charset of the document from the first "charset=" occurrence.
     * Accepts both content="text/html; charset=utf-8" and charset="utf-8".
     * Falls back to "utf-8" if no charset is declared; terminates the script
     * with die() if the end of the charset name cannot be found.
     */
    protected function get_charset() {
        if (!$this->find_tag_and_trim("charset=")) {
            $this->charset="utf-8"; // charset missing - assume UTF-8
            return;
        }
        // Skip a quote placed directly behind "=", as in <meta charset="utf-8">.
        $rest=ltrim($this->buffer, "\"' \t");
        // The name ends at the closing quote (or at whitespace, ';' or '>').
        $length=strcspn($rest, "\"' \t\r\n>;");
        if ($length===strlen($rest))
            die("Can't find the quotes after 'charset=...'");
        $this->charset=substr($rest, 0, $length);
    }

    /**
     * Parses the document at $url; implemented by the concrete parsers.
     */
    abstract function parse($url);
}
?>

Také k dispozici: Unified diff