Using XPath to scrape the web page

This commit is contained in:
Peter Boráros 2013-06-03 19:32:07 +02:00
parent 55f9618a24
commit 323cf99b79
2 changed files with 14 additions and 3 deletions

View file

@ -1,12 +1,22 @@
HAML=haml HAML=haml
XMLLINT=xmllint
all: index.html doprava.html all: index.html doprava.html
refresh: get:
curl 'http://vvv.chmi.cz/hydro/detail_stanice/307225.html' | sed -n '153,302 p' > data/table.html curl 'http://vvv.chmi.cz/hydro/detail_stanice/307225.html' > data/307225.html
wget 'http://vvv.chmi.cz/hydro/graph/big/307225_H.png' -O data/stav.png wget 'http://vvv.chmi.cz/hydro/graph/big/307225_H.png' -O data/stav.png
wget 'http://vvv.chmi.cz/hydro/graph/big/307225_Q.png' -O data/prutok.png wget 'http://vvv.chmi.cz/hydro/graph/big/307225_Q.png' -O data/prutok.png
scrap: get
xmllint --html --encode utf8 data/307225.html --output data/307225.html
xmllint --html --xpath '//div[@class="box"]/div[@class="cont"]/p[2]/text()' data/307225.html > data/timestamp.html
xmllint --html --xpath '//div[@class="box"]/div[@class="cont"]/table[2]//table[5]/tr[position()>1]' data/307225.html > data/table.html
# xmllint --html --xpath '//div[@class="box"]/div[@class="cont"]/table[2]//table[3]//tr' data/307225.html | iconv -f cp1250 -t utf8 > data/legend.html
# xmllint --html --xpath '//div[@class="box"]/div[@class="cont"]/table[2]//table[1]//tr' data/307225.html | iconv -f cp1250 -t utf8 > data/info.html
refresh: get scrap
%.html: %.haml %.html: %.haml
$(HAML) $< $@ $(HAML) $< $@

View file

@ -13,7 +13,8 @@
%li <a href='doprava.html'>Doprava</a> %li <a href='doprava.html'>Doprava</a>
.content .content
%p
=File.read('data/timestamp.html')
%table.twocolumn %table.twocolumn
%tr %tr
%td(style="padding: 16px;") %td(style="padding: 16px;")