oc2017 #1

Open · wants to merge 1 commit into master

3 changes: 3 additions & 0 deletions .gitignore
@@ -1,2 +1,5 @@
 # Ignore output of scraper
 data.sqlite
+scraperwiki.sqlite
+.idea
+.DS_Store
10 changes: 10 additions & 0 deletions Gemfile
@@ -0,0 +1,10 @@
+source 'https://rubygems.org'
+ruby '2.4.1'
+
+gem 'watir'
+gem 'nokogiri'
+gem 'mechanize'
+gem 'digest'
+gem 'scraperwiki', git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
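
Note on the git-pinned dependency: openaustralia's morph_defaults branch of scraperwiki-ruby is the variant morph.io runs, and it saves scraper output to a local SQLite file (data.sqlite by default), in line with the SQLite entries ignored above. A minimal sketch of the save call it provides, with an illustrative record:

require 'scraperwiki'

# Upserts one row into the default 'data' table of data.sqlite,
# keyed on 'excuse_id' (record contents are made up for illustration).
ScraperWiki.save_sqlite(['excuse_id'], 'excuse_id' => 'abc123', 'name' => 'Example MP')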


68 changes: 68 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,68 @@
+GIT
+  remote: https://github.com/openaustralia/scraperwiki-ruby.git
+  revision: fc50176812505e463077d5c673d504a6a234aa78
+  branch: morph_defaults
+  specs:
+    scraperwiki (3.0.1)
+      httpclient
+      sqlite_magic
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    childprocess (0.8.0)
+      ffi (~> 1.0, >= 1.0.11)
+    digest (0.0.1)
+    domain_name (0.5.20170404)
+      unf (>= 0.0.5, < 1.0.0)
+    ffi (1.9.18)
+    http-cookie (1.0.3)
+      domain_name (~> 0.5)
+    httpclient (2.8.3)
+    mechanize (2.7.5)
+      domain_name (~> 0.5, >= 0.5.1)
+      http-cookie (~> 1.0)
+      mime-types (>= 1.17.2)
+      net-http-digest_auth (~> 1.1, >= 1.1.1)
+      net-http-persistent (~> 2.5, >= 2.5.2)
+      nokogiri (~> 1.6)
+      ntlm-http (~> 0.1, >= 0.1.1)
+      webrobots (>= 0.0.9, < 0.2)
+    mime-types (3.1)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2016.0521)
+    mini_portile2 (2.3.0)
+    net-http-digest_auth (1.4.1)
+    net-http-persistent (2.9.4)
+    nokogiri (1.8.1)
+      mini_portile2 (~> 2.3.0)
+    ntlm-http (0.1.1)
+    rubyzip (1.2.1)
+    selenium-webdriver (3.6.0)
+      childprocess (~> 0.5)
+      rubyzip (~> 1.0)
+    sqlite3 (1.3.13)
+    sqlite_magic (0.0.6)
+      sqlite3
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.7.4)
+    watir (6.8.4)
+      selenium-webdriver (~> 3.4, >= 3.4.1)
+    webrobots (0.1.2)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  digest
+  mechanize
+  nokogiri
+  scraperwiki!
+  watir
+
+RUBY VERSION
+   ruby 2.4.1p111
+
+BUNDLED WITH
+   1.15.4
102 changes: 61 additions & 41 deletions scraper.rb
@@ -1,57 +1,77 @@
-require 'scraperwiki'
-require 'rubygems'
-require 'mechanize'
-require 'json'
-require 'nokogiri'
 require 'pp'
+require 'nokogiri'
+require 'mechanize'
+require 'watir'
+require 'digest'
+require 'scraperwiki'

-BASE_URL='http://www.nrsr.sk/web/Default.aspx?sid=poslanci/ospravedlnenia_result'
-
-BASE_CAL_URL='http://www.nrsr.sk/web/Services/CalendarService.asmx/RenderCalendar?month=10&year=2011&app="nrdvp"&lang=""'
-
-
-@agent = Mechanize.new { |agent|
-  agent.user_agent_alias = 'Mac Safari'
-}
-
-def excuse_url(m)
-  (1..5).each { |meeting_num|
-    yield :name => m[:name],
-          :url => "http://www.nrsr.sk/web/Default.aspx?sid=poslanci/ospravedlnenia_result&PoslanecMasterID="+m[:id]+
-                  "&CisObdobia=#{meeting_num}&DatumOd=1900-1-1%200:0:0&DatumDo=2100-1-1%200:0:0&CisSchodze="
-  }
-end
+BASE_URL = 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/ospravedlnenia_result'.freeze
+URL_SUF = '&DatumOd=1900-1-1%200:0:0&DatumDo=2100-1-1%200:0:0&CisSchodze='.freeze
+PL = 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/zoznam_abc&ListType=0&CisObdobia='.freeze

+@agent = Mechanize.new
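
The replacement constants split the excuses ("ospravedlnenia") query into reusable pieces: the member id and term (CisObdobia) are interpolated between BASE_URL and URL_SUF, while URL_SUF pins a wide-open date window (1900 to 2100, with URL-encoded spaces) and an empty session filter (CisSchodze), so a single request covers all of a member's excuses in a term. A sketch of the assembled URL, using a hypothetical member id:

# Hypothetical values; excuses() below builds the same string.
mop_id = '277'
term   = 2
"#{BASE_URL}&PoslanecMasterID=#{mop_id}&CisObdobia=#{term}#{URL_SUF}"
# => "http://www.nrsr.sk/web/Default.aspx?sid=poslanci/ospravedlnenia_result&PoslanecMasterID=277
#     &CisObdobia=2&DatumOd=1900-1-1%200:0:0&DatumDo=2100-1-1%200:0:0&CisSchodze="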

-def mop_list
-  list = []
-  @agent.get(BASE_URL) do |page|
-    page.at('div#_sectionLayoutContainer__panelContent').search('select#_sectionLayoutContainer_ctl01_PoslanecMasterID option').each do |opt|
-      next if opt.attr('value') == '-1'
-      yield :id => opt.attr('value'), :name => opt.text
-    end
-  end
-  list
-end
+def list_mops
+  page = @agent.get('http://www.nrsr.sk/web/Default.aspx?sid=poslanci/zoznam_abc&ListType=0&CisObdobia=1')
+  termx = page.xpath('//select[@id = "_sectionLayoutContainer_ctl01__currentTerm"]//@value').map(&:value).max.to_i
+  (2..termx).each do |term|
+    pp "in term #{term}"
+    page = @agent.get("#{PL}#{term}")
+    page.xpath('//div[@class = "mps_list"]//li//a').each do |member|
+      l = member.attr('href')
+      i = l.match('.*PoslanecID=(.*)&.*')
+      yield mop_id: i[1], name: member.text, term: term, url: "http://www.nrsr.sk/#{l}"
+      # p id: i[1], name: member.text, term: term, url: "http://www.nrsr.sk/#{l}"
+    end
+  end
+end
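
list_mops reads the term <select> on the member list page to find the newest parliamentary term, then walks terms 2 through termx and yields one record per member link, pulling the member id out of the href. A worked example of that match on a made-up href (the greedy capture assumes exactly one query parameter follows PoslanecID):

l = 'Default.aspx?sid=poslanci/poslanec&PoslanecID=277&CisObdobia=2'  # made-up href
i = l.match('.*PoslanecID=(.*)&.*')
i[1]  # => "277"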

-def list_excuses
-  mop_list do |mop|
-    excuse_url(mop) do |url|
-      @agent.get(url[:url]) do |page|
-        next unless page.at('table.tab_zoznam')
-        page.at('table.tab_zoznam').search('tr').each do |r|
-          next if r.attr('class') == 'tab_zoznam_header'
-          yield :name => url[:name], :date => r.search('td')[2].text.strip, :party => r.search('td')[1].text.strip, :reason => r.search('td')[3].text.strip
-        end
-      end
-    end
-  end
-end
+def pager(url)
+  pages = []
+  page = @agent.get(url)
+  pages.push(page) if page.at('table.tab_zoznam')
+  if page.at('//table[@class="tab_zoznam"]//table')
+    links = page.xpath('//table[@class="tab_zoznam"]//table//tr/td//@href').map(&:value).uniq
+    links.each do |link|
+      link.slice! 'javascript:'
+      begin
+        br = Watir::Browser.start(url, :phantomjs)
+      rescue Net::ReadTimeout, Net::HTTPRequestTimeOut, Errno::ETIMEDOUT, Errno::ECONNREFUSED => ex
+        puts "#{ex.class} detected, retrying"
+        retry
+      end
+      br.execute_script(link)
+      sleep 5
+      pages.push(Nokogiri::HTML(br.html))
+    end
+  end
+  pages
+end
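
pager keeps the plain Mechanize page when the results fit on one screen; only when a nested pager table is present does it start a PhantomJS-driven Watir browser and replay each javascript: link, snapshotting the rendered HTML after each click. Roughly, with a made-up pager href (assuming the site's usual ASP.NET postback links):

link = "javascript:__doPostBack('ctl00$pager','Page$2')"  # made-up href
link.slice! 'javascript:'
link  # => "__doPostBack('ctl00$pager','Page$2')"
# br.execute_script(link) then triggers the postback in the headless
# browser; the fixed sleep 5 is a crude wait for the page to re-render.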

-def make_uuid(i)
-  { "unique_id" => "#{i[:name].downcase.gsub(/\W/, '-')}-#{i[:date].gsub(/(\W|\.)/, '-')}" }
-end
+def excuses
+  list_mops do |excuse|
+    excuse_url = "#{BASE_URL}&PoslanecMasterID=#{excuse[:mop_id]}&CisObdobia=#{excuse[:term]}#{URL_SUF}"
+    pages = pager(excuse_url)
+    pages.each do |page|
+      page.at('table.tab_zoznam').search('tr').each do |r|
+        next if r.attr('class') == 'tab_zoznam_header'
+        next if r.attr('class') == 'pager'
+        next if r.search('td')[0].text.strip.length <= 2
+        yield name: excuse[:name],
+              mop_id: excuse[:mop_id],
+              date: r.search('td')[2].text.strip,
+              term: excuse[:term],
+              party: r.search('td')[1].text.strip,
+              reason: r.search('td')[3].text.strip
+      end
+    end
+  end
+end

-list_excuses do |item|
-  pp item.merge(make_uuid(item))
-  ScraperWiki.save(['unique_id'], item.merge(make_uuid(item)))
-end
+p Time.now
+excuses do |item|
+  id = { 'excuse_id' => Digest::MD5.hexdigest("#{item[:date].gsub(/[^0-9,.]/, '')}#{item[:mop_id]}") }
+  # p item.merge(id)
+  ScraperWiki.save_sqlite(['excuse_id'], item.merge(id))
+end
+p Time.now
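
The new unique key replaces the old name/date slug with an MD5 over the digits of the date plus the member id, so re-runs upsert rows instead of duplicating them. One caveat follows directly from the formula: two excuses for the same member on the same date hash to the same excuse_id and collapse into a single row. A worked sketch with illustrative values:

require 'digest'

date   = '11. 9. 2017'  # as scraped from the date cell
mop_id = '277'          # hypothetical member id
Digest::MD5.hexdigest("#{date.gsub(/[^0-9,.]/, '')}#{mop_id}")
# the gsub keeps only digits, commas and dots: "11.9.2017" + "277" => "11.9.2017277"
# => a 32-character hex string stored as excuse_id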