/ - Diff - FreenetIS - Redmine

« Předchozí | Další »

Revize 126

Přidáno uživatelem Tomáš Dulík před téměř 16 roky(ů)

Přidány nové knihovny pro parsování výpisů z Raiffeisen eBanky - Parser_Ebanka.php a Parser_Html_Table.php. Příklad použití:

// Load the Parser_Ebanka library through an absolute Windows path.
require_once("c:/www/freenetis/kohana/application/libraries/Parser_Ebanka.php");
// Run the example: Main::start() is defined in the class below.
Main::start();

/**
 * Usage example for Parser_Ebanka: every parsed statement row is echoed as
 * one table row and the total number of rows is printed at the end.
 */
class Main {
    /** Number of rows handed to printdata() so far. */
    private static $linenr=0;

    /**
     * Parser callback: echoes one parsed row, one table cell per value.
     *
     * @param mixed $data row passed in by the parser; it is iterated with
     *                    foreach and every value is printed
     */
    static function printdata($data) {
        // Count on the static property. The previous "global $linenr" bumped
        // an unrelated global variable, so start() always reported Linenr=0.
        self::$linenr++;
        echo "&lt;tr&gt;\n";
        foreach ($data as $d) {
            echo "    &lt;td&gt;$d&lt;/td&gt;\n";
        }
        echo "&lt;/tr&gt;\n";
    }

    /**
     * Parses one statement file, printing each row through printdata(),
     * and finally prints the number of rows seen.
     */
    static function start() {
        echo "&lt;table border=1&gt;\n";
        $parser=new Parser_Ebanka();
        $parser->set_callback(array("Main", "printdata"));
        $parser->parse("c:/txt/unart/ucto/vypisy/2007/09.html");
        // Alternative inputs used during development:
        //$parser->parse("http://www.rb.cz/firemni-finance/transparentni-ucty/?root=firemni-finance&item1=transparentni-ucty&tr_acc=vypis&account_number=184932848");
        // $parser->parse("c:/txt/unart/ucto/vypisy/2007/cast2008.htm");
        // $parser->parse("http://localhost/2007.html");
        //$parser->parse("c:/txt/unart/ucto/vypisy/2007/2007.html");

        echo "&lt;/table&gt;\n";
        echo "Linenr=".self::$linenr."&lt;p&gt;";
    }
}

     <?php
     require_once("Parser_Html_Table.php");
     /*
     ALTER TABLE `money_transfer_bank_infos`
     ADD COLUMN `comment` VARCHAR(255) after `date_time`;
     ALTER TABLE `accounts`
     ADD COLUMN `number` VARCHAR(255) after `name`;
      *  */
     /**
      * @author Tomas <Dulik at unart dot cz>
      * @version 1.0
      * Parser_Ebanka is a parser for getting data from bank account transaction listings
      * in the HTML format used by the Czech bank called "Ebanka" (now Raiffeisen Bank).
      *
      * The parsing is a bit peculiar, because Ebanka uses one format for
      * listings that are visible to the general public (the "transparent" listing used
      * by NGOs / non-profit organizations) and a different format for the regular listing
      * used in the ebanking application.
      * This parser autodetects these two formats and parses the data accordingly.
      *
      * Benchmarks:
      * Machine: Notebook FSC Lifebook S7110, CPU: Pentium T2400 @ 1.8 GHz
      * Win XP SP2, Apache 2.2.3, PHP 5.2.0
      * Regular listing with 136 table rows (1 week listing): time=0.1 sec, memory=205 kB
      * Regular listing with 2175 table rows (whole year listing): time=1.6 sec, memory=205 kB
      * Transparent listing with 467 table rows: time=0.14 sec, memory=122 kB
      * */
     class Parser_Ebanka extends Parser_Html_Table {
         const START_STRING="Pohyby na";    // last string before the start of the main <TABLE>
         const YEAR_STRING="za obdob";      // only present in regular (non-transparent) listings
         protected $year=false;             // year of the listing
         /**
          * Callback that is called after one complete row of the HTML table
          * has been parsed (e.g. to store each statement row into a database).
          * @see set_callback
          * @var string|mixed
          */
         protected $callback;
         /**
          * Object passed to the callback; its properties hold one parsed row.
          * The same object is reused (overwritten) for every row.
          */
         protected $result;

         /**
          * Extracts the year of the listing from the buffer, which is expected
          * to start right after the string "za obdob", e.g.
          * buffer="í 1.12.2007/31.12.2007</td>" yields year "2007".
          * Terminates the script if no "/" or no "." before it can be found.
          */
         protected function get_year() {
             // Look for the slash, reading more input as long as it is missing.
             while (($slashPos=strpos($this->buffer, "/"))===false &&
                     $this->get_line());
             if ($slashPos===false)
                 die("Nemůžu najít znak '/' v řetězci 'Za období ...'");
             $toSlash=substr($this->buffer, 1, $slashPos-1);
             // Find the last dot before the slash; the year follows it.
             if (($dotPos=strrpos($toSlash, "."))===false)
                 die("Nemůžu najít znak '.' v řetězci 'Za období ...'");
             $this->year=substr($toSlash, $dotPos+1);
         }

         /**
          * Normalizes an amount cell: strips tags and spaces and turns the
          * decimal comma into a decimal point.
          *
          * @param string $field raw cell content
          * @return string amount, e.g. "1 234,50" becomes "1234.50"
          */
         protected function get_castka($field) {
             $field=strip_tags($field);
             // NOTE(review): the original list held two visually identical
             // entries; the second one was presumably a literal non-breaking
             // space. It is written as an explicit UTF-8 escape here so that
             // it cannot be lost by editors or copy/paste - confirm.
             $field=str_replace(array(" ", "\xC2\xA0"), "", $field);
             return strtr($field, ",", ".");
         }

         /**
          * Parses the rows of a "transparent" listing (6 cells per row) and
          * calls the callback once per completed row.
          */
         protected function get_data_from_transparent() {
             // Tolerate parse() being used without set_callback().
             if (!isset($this->result)) $this->result=new stdClass();
             $res=$this->result;
             $first=true;
             // Stop as soon as no further batch of rows is available; looping
             // once more would re-read the cells of the previous batch and
             // report the last rows twice.
             while ($this->get_table_rows()) {
                 $fields=str_replace(array("\r", "\n", "\t"), "", $this->matches[1]);
                 $nr=count($fields);
                 $i=$first ? 6 : 0;      // the first 6 cells of the table are skipped
                 $first=false;
                 for (; $i<$nr; $i++) {
                     $field=$fields[$i];
                     switch ($i % 6) {
                         case 0:         // example: 31.08.2008<br/>06:11
                             $arr=explode("<br/>", $field);
                             $arrDate=explode(".", $arr[0]);
                             $res->date_time=$arrDate[2]."-".$arrDate[1]."-".$arrDate[0]." ".$arr[1];
                             break;
                         case 1:         // comment<br/>name of the payer's account
                             $field=html_entity_decode($field,ENT_QUOTES,"UTF-8");
                             $arr=explode("<br/>", $field);
                             $res->comment=$arr[0];
                             $res->name=$arr[1];
                             break;
                         case 2:         // debit date twice, then <br/>payment type
                             $arr=explode("<br/>", $field);
                             $res->typ=html_entity_decode($arr[2],ENT_QUOTES,"UTF-8");
                             break;
                         case 3:         // VS<br/>KS<br/>SS
                             $arr=explode("<br/>", $field);
                             $res->variable_symbol=$arr[0];
                             $res->constant_symbol=$arr[1];
                             $res->specific_symbol=$arr[2];
                             break;
                         case 4:         // amount
                             $res->castka=$this->get_castka($field);
                             break;
                         case 5:         // fee - last cell, the row is complete
                             $res->poplatek=$this->get_castka($field);
                             // the row can now be handed over, e.g. for storing in a database
                             if (isset($this->callback)) call_user_func($this->callback, $res);
                             break;
                     }
                 }
             }
         }

         /**
          * Parses the rows of a regular (ebanking) listing (7 cells per row)
          * and calls the callback once per completed row. Dates in this format
          * carry no year, so $this->year (see get_year) is used.
          */
         protected function get_data_from_regular() {
             // Tolerate parse() being used without set_callback().
             if (!isset($this->result)) $this->result=new stdClass();
             $res=$this->result;
             $first=true;
             // Stop as soon as no further batch of rows is available; looping
             // once more would re-read the cells of the previous batch and
             // report the last rows twice.
             while ($this->get_table_rows()) {
                 $fields=str_replace(array("\r", "\n", "\t"), "", $this->matches[1]);
                 $nr=count($fields);
                 $i=$first ? 7 : 0;      // the first 7 cells of the table are skipped
                 $first=false;
                 for (; $i<$nr; $i++) {
                     $field=html_entity_decode($fields[$i],ENT_QUOTES,"UTF-8");
                     // NOTE(review): removes the non-breaking spaces produced by
                     // decoding &nbsp;. The original used a literal character
                     // here, presumably U+00A0; spelled as an escape - confirm.
                     $field=str_replace("\xC2\xA0", "", $field);
                     switch ($i % 7) {
                         case 0:         // statement number, 1..N within each month
                             $res->cislo=$field;
                             break;
                         case 1:         // date and time, example: 08.08.<br>06:11
                             $arr=preg_split("/<br>/si", $field);
                             $arrDate=explode(".", $arr[0]);
                             $res->date_time=$this->year."-".$arrDate[1]."-".$arrDate[0]." ".$arr[1];
                             break;
                         case 2:         // comment<br>account name<br>payer's account number
                             $arr=preg_split("/<br>/si", $field);    // split on <BR> or <br>
                             $res->comment=$arr[0];
                             $res->name=$arr[1];
                             $res->cislo_uctu=$arr[2];
                             break;
                         case 3:         // debit date<br><br>payment type
                             $arr=preg_split("/<br>/si", $field);
                             $res->typ=$arr[2];
                             break;
                         case 4:         // SS<br>VS<br>KS
                             $arr=preg_split("/<br>/si", $field);
                             $res->variable_symbol=$arr[1];
                             $res->constant_symbol=$arr[2];
                             $res->specific_symbol=$arr[0];
                             break;
                         case 5:         // amount
                             $res->castka=$this->get_castka($field);
                             break;
                         case 6:         // fee - last cell, the row is complete
                             $res->poplatek=$this->get_castka($field);
                             // the row can now be handed over, e.g. for storing in a database
                             if (isset($this->callback)) call_user_func($this->callback, $res);
                             break;
                     }
                 }
             }
         }

         /**
          * Sets the function that is called after every parsed row of the
          * HTML table.
          *
          * @param string|mixed $function name of the callback function; for a
          *        class method pass array(class, method_name)
          * @param mixed $param optional object that receives the row data and
          *        is passed to the callback; a new stdClass is used if omitted
          */
         public function set_callback($function, $param=NULL) {
             $this->callback=$function;
             $this->result=isset($param) ? $param : new stdClass();
         }

         /**
          * Opens the listing, detects its format (regular or transparent) and
          * parses all of its rows. Terminates the script if neither the period
          * string nor the start of the table can be found.
          *
          * @param string $url file name or URL of the HTML listing
          */
         public function parse($url) {
             $this->open($url);
             $this->get_charset();
             // Search for the period of the listing or the beginning of the table.
             $found=$this->find_tags_and_trim(array(self::YEAR_STRING, self::START_STRING));
             switch ($found) {
                 case 0:     // period found = regular (non-transparent) listing
                     $transparent=false;
                     $this->get_year();
                     if (!$this->find_tag_and_trim(self::START_STRING))
                         die ("Nemohu najít začátek tabulky: '".self::START_STRING."'");
                     break;
                 case 1:     // period not found = transparent listing
                     $transparent=true;
                     $this->year=date("Y");
                     break;
                 default:    // neither string found
                     die ("Nemohu najít začátek tabulky nebo datum/rok");
             }
             if ($transparent)
                 $this->get_data_from_transparent();
             else
                 $this->get_data_from_regular();
             fclose($this->file);
         }
     }
     ?>

     <?php
     /**
      * @author Tomas <Dulik at unart dot cz>
      * @version 1.0
      * Parser_Html_Table is an ABSTRACT class containing methods useful
      * for parsing HTML tables in generic HTML files.
      *
      * Motivation: we want to parse HTML tables to get interesting data from various web sites.
      * The HTML code of the tables often does not conform to XML/XHTML rules.
      * It often does not conform even to HTML4, e.g. the table row is not closed by </tr>,
      * the table cell is not closed by </td> etc.
      * Therefore, XML parsers can't be used for this.
      * The Tidy extension is not available on all hostings.
      * If you think about parsing a non-XHTML non-HTML4.0 table, look at this class.
      * The methods have been optimized to give maximum possible performance
      * and memory efficiency.
      * For an example of how to use this class, see the Parser_Ebanka class.
      */
     abstract class Parser_Html_Table {
         const TIMEOUT=3;            // timeout passed to default_socket_timeout / stream_set_timeout
         protected $file;            // stream opened by open()
         protected $charset;         // charset of the document, set by get_charset()
         protected $buffer="";       // not yet consumed part of the input
         protected $eoln_pos=0;      // position of the last End Of Line in the buffer
         protected $matches;         // result of the last preg_match_all() in get_table_rows()

         /**
          * Opens the given file or URL for reading and resets the parser state,
          * so one instance can be used for several documents in a row.
          * Does nothing when $url is empty; terminates the script when the
          * source can not be opened.
          *
          * @param string $url file name or URL
          */
         public function open($url) {
             if ($url!="") {
                 $old=ini_set('default_socket_timeout', self::TIMEOUT);
                 $this->file=fopen($url, "rb");
                 // restore the previous setting (ini_set returns false on failure)
                 if ($old!==false) ini_set('default_socket_timeout', $old);
                 if ($this->file===false)
                     die ("Can not open file! Check if $url exists!");
                 stream_set_timeout($this->file, self::TIMEOUT);
                 // forget leftovers of a previously parsed document
                 $this->buffer="";
                 $this->eoln_pos=0;
                 $this->matches=null;
             }
         }

         /**
          * Appends **AT LEAST** one line from the file to the buffer and sets
          * eoln_pos to the last end of line in the buffer (or to the end of
          * the buffer when EOF is reached without any end of line).
          *
          * Data are read in 8192-byte blocks; according to the original
          * author this is much faster than fgets in PHP4 and usually still
          * faster in PHP5, and it does not need to hold a whole line of
          * unbounded length in one fgets call.
          *
          * @return boolean false when no more data could be read, true otherwise
          */
         public function get_line() {
             $got_data=false;
             while (!feof($this->file)) {
                 $new_part=fread($this->file, 8192);     // 8192 bytes or one packet
                 if ($new_part===false) break;           // read error: treat it like EOF
                 $got_data=true;
                 $this->buffer .= $new_part;
                 // search the end of line from the end of the buffer
                 if (($eoln=strrpos($this->buffer, "\n"))!==false) {
                     $this->eoln_pos=$eoln;
                     return true;
                 }
             }
             if (!$got_data) return false;               // EOF right at the beginning
             $this->eoln_pos=strlen($this->buffer);      // EOF without EOLN
             return true;
         }

         /**
          * Removes the first $length bytes of the buffer and keeps eoln_pos
          * pointing into the remaining buffer (never below 0).
          */
         private function discard_prefix($length) {
             $this->buffer=(string)substr($this->buffer, $length);
             $this->eoln_pos=max(0, $this->eoln_pos-$length);
         }

         /**
          * Searches the input for $tag (case-insensitively), reading more data
          * as needed, and trims the buffer up to and including the tag.
          * While the tag is not found, everything up to the last end of line
          * is dropped; the last, not yet terminated line stays in the buffer.
          *
          * @param string $tag string to look for
          * @return boolean true if found (the buffer then starts right behind
          *         the first occurrence of $tag), false if EOF was reached
          */
         protected function find_tag_and_trim($tag) {
             $found=false;
             do {
                 if (($pos=stripos($this->buffer, $tag))!==false) {
                     $found=true;
                     $pos+=strlen($tag);         // cut right behind the tag
                 } else {
                     $pos=$this->eoln_pos;       // cut at the last end of line
                 }
                 $this->discard_prefix($pos);
             } while (!$found && $this->get_line());
             return $found;
         }

         /**
          * Same as find_tag_and_trim, but searches for several tags. The tags
          * are tried in array order, so an earlier tag wins even if a later
          * tag occurs first in the buffer.
          *
          * @param array $tags strings to look for
          * @return int index of the tag that was found, or count($tags) if
          *         none was found before EOF
          */
         protected function find_tags_and_trim($tags) {
             $found=false;
             do {
                 $i=0;
                 $pos=$this->eoln_pos;           // default: cut at the last end of line
                 foreach ($tags as $tag) {
                     if (($tag_pos=stripos($this->buffer, $tag))!==false) {
                         $found=true;
                         $pos=$tag_pos+strlen($tag);     // cut right behind the tag
                         break;
                     }
                     $i++;                       // this tag is missing, try the next one
                 }
                 $this->discard_prefix($pos);
             } while (!$found && $this->get_line());
             return $i;
         }

         /**
          * Finds the end of the last table row in the buffer. It also handles
          * rows that are not closed by </tr> but followed by another <tr> or
          * terminated by </table>.
          * (The original also documented a slightly slower PHP4 variant based
          * on a regular expression; PHP5 strripos accepts multi-character
          * needles, so plain string search is used.)
          *
          * @return int|false position of the last "</tr", else of the last
          *         "<tr", else of the last "</table"; false if none is found
          */
         protected function find_row_end() {
             if (($res=strripos($this->buffer, "</tr"))!==false)
                 return $res;
             // compare with false explicitly: position 0 is a valid result
             if (($res=strripos($this->buffer, "<tr"))!==false)
                 return $res;
             return strripos($this->buffer, "</table");
         }

         /**
          * Fills the buffer with at least one table row, extracts the contents
          * of all cells of the complete rows into $this->matches (index 1 holds
          * the cell texts, converted to UTF-8) and removes the parsed rows from
          * the buffer.
          *
          * @return boolean false when no further row is available;
          *         $this->matches then holds empty lists
          */
         protected function get_table_rows() {
             // never expose the cells of a previous call as a new result
             $this->matches=array(array(), array());
             // find the starting <tr> tag
             if (!$this->find_tag_and_trim("<tr")) return false;
             // find the end of the last row, reading more input while it is missing
             while (($lastTagPos=$this->find_row_end())===false
                     && $this->get_line());
             if ($lastTagPos===false) return false;
             $rows=substr($this->buffer, 0, $lastTagPos);
             if (strcasecmp($this->charset, "utf-8")!=0)
                 $rows=iconv($this->charset, "UTF-8", $rows);
             // cell content = text between <td ...> and the next <td, </td or <tr
             preg_match_all("/<td[^>]*>(?:<[^>]*>)?(.*?)<(?:(?:\/)?td|tr)/si", $rows, $this->matches);
             $this->discard_prefix($lastTagPos);
             return true;
         }

         /**
          * Detects the charset of the document from the first "charset="
          * occurrence; "utf-8" is used when there is none or the value is empty.
          * Handles both content="text/html; charset=x" and charset="x".
          * Terminates the script if the value is not terminated.
          * NOTE(review): when "charset=" is missing, the search has already
          * consumed the input up to EOF - confirm this is acceptable.
          */
         protected function get_charset() {
             if (!$this->find_tag_and_trim("charset=")) {
                 $this->charset="utf-8";
                 return;
             }
             // skip an opening quote, then take everything up to the first delimiter
             $value=ltrim($this->buffer, "\"'");
             $length=strcspn($value, "\"'>;/ \t\r\n");
             if ($length===strlen($value))
                 die("Can't find the quotes after 'charset=...'");
             $this->charset=($length>0) ? substr($value, 0, $length) : "utf-8";
         }

         abstract function parse($url);
     }
     ?>

Také k dispozici: Unified diff

Projekt

Obecné

Profil

FreenetIS

Revize 126

Přidáno uživatelem Tomáš Dulík před téměř 16 roky(ů)