From 323cf99b7938ca44123ca4356ae1abd5b4ca58fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20Bora=CC=81ros?= Date: Mon, 3 Jun 2013 19:32:07 +0200 Subject: [PATCH 1/2] using xpath to scrape web --- Makefile | 14 ++++++++++++-- index.haml | 3 ++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 594527a..da5dd38 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,22 @@ HAML=haml +XMLLINT=xmllint all: index.html doprava.html -refresh: - curl 'http://vvv.chmi.cz/hydro/detail_stanice/307225.html' | sed -n '153,302 p' > data/table.html +get: + curl 'http://vvv.chmi.cz/hydro/detail_stanice/307225.html' > data/307225.html wget 'http://vvv.chmi.cz/hydro/graph/big/307225_H.png' -O data/stav.png wget 'http://vvv.chmi.cz/hydro/graph/big/307225_Q.png' -O data/prutok.png +scrap: get + xmllint --html --encode utf8 data/307225.html --output data/307225.html + xmllint --html --xpath '//div[@class="box"]/div[@class="cont"]/p[2]/text()' data/307225.html > data/timestamp.html + xmllint --html --xpath '//div[@class="box"]/div[@class="cont"]/table[2]//table[5]/tr[position()>1]' data/307225.html > data/table.html +# xmllint --html --xpath '//div[@class="box"]/div[@class="cont"]/table[2]//table[3]//tr' data/307225.html | iconv -f cp1250 -t utf8 > data/legend.html +# xmllint --html --xpath '//div[@class="box"]/div[@class="cont"]/table[2]//table[1]//tr' data/307225.html | iconv -f cp1250 -t utf8 > data/info.html + +refresh: get scrap + %.html: %.haml $(HAML) $< $@ diff --git a/index.haml b/index.haml index 244dc26..978c445 100644 --- a/index.haml +++ b/index.haml @@ -13,7 +13,8 @@ %li Doprava .content - + %p + =File.read('data/timestamp.html') %table.twocolumn %tr %td(style="padding: 16px;") From d9e08dfb81fa789875761ecacdd33edae0bfdac1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20Bora=CC=81ros?= Date: Mon, 3 Jun 2013 20:26:17 +0200 Subject: [PATCH 2/2] nicer Makefile --- Makefile | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 
deletions(-) diff --git a/Makefile b/Makefile index 16f744d..65eefea 100644 --- a/Makefile +++ b/Makefile @@ -1,16 +1,24 @@ HAML=haml XMLLINT=xmllint +# option #1 +DATAURL=http://hydro.chmi.cz/hpps/popup_hpps_prfdyn.php?seq=307225A +IMGURL1=http://hydro.chmi.cz/hpps/tmp/img/big/307225_H.png +IMGURL2=http://hydro.chmi.cz/hpps/tmp/img/big/307225_Q.png DATAPATH=//table[@class="stdstationtbl"]/./tr[3]//table/tr[position()>1] +# option #2 +#DATAURL=http://vvv.chmi.cz/hydro/detail_stanice/307225.html +#IMGURL1=http://vvv.chmi.cz/hydro/graph/big/307225_H.png +#IMGURL2=http://vvv.chmi.cz/hydro/graph/big/307225_Q.png +#DATAPATH=//table[2]//tr[3]/td//table//tr[position()>1] + all: index.html doprava.html refresh: - wget 'http://hydro.chmi.cz/hpps/tmp/img/big/307225_H.png' -O data/stav.png - wget 'http://hydro.chmi.cz/hpps/tmp/img/big/307225_Q.png' -O data/prutok.png - $(XMLLINT) --html --encode utf8 http://hydro.chmi.cz/hpps/popup_hpps_prfdyn.php?seq=307225 --output data/307225.html - $(XMLLINT) --html --xpath '$(DATAPATH)' data/307225.html > data/table.html - + wget '$(IMGURL1)' -O data/stav.png + wget '$(IMGURL2)' -O data/prutok.png + wget '$(DATAURL)' -O - | $(XMLLINT) --html --encode utf8 - | $(XMLLINT) --html --xpath '$(DATAPATH)' - > data/table.html %.html: %.haml $(HAML) $< $@