From 7d8e232c0f37d903d848e08e88542a2a253a1e01 Mon Sep 17 00:00:00 2001
From: lesspointless
Date: Thu, 26 Oct 2017 19:37:40 +0200
Subject: [PATCH] oc2017

---
 .gitignore   |   3 ++
 Gemfile      |  10 +++++
 Gemfile.lock |  68 ++++++++++++++++++++++++++++++++++
 scraper.rb   | 102 ++++++++++++++++++++++++++++++---------------------
 4 files changed, 142 insertions(+), 41 deletions(-)
 create mode 100644 Gemfile
 create mode 100644 Gemfile.lock

diff --git a/.gitignore b/.gitignore
index 66d464d..451e77e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
 # Ignore output of scraper
 data.sqlite
+scraperwiki.sqlite
+.idea
+.DS_Store
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..0263519
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,10 @@
+source 'https://rubygems.org'
+ruby '2.4.1'
+
+gem 'watir'
+gem 'nokogiri'
+gem 'mechanize'
+gem 'digest'
+gem 'scraperwiki', git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
+
+
diff --git a/Gemfile.lock b/Gemfile.lock
new file mode 100644
index 0000000..9a2ce78
--- /dev/null
+++ b/Gemfile.lock
@@ -0,0 +1,68 @@
+GIT
+  remote: https://github.com/openaustralia/scraperwiki-ruby.git
+  revision: fc50176812505e463077d5c673d504a6a234aa78
+  branch: morph_defaults
+  specs:
+    scraperwiki (3.0.1)
+      httpclient
+      sqlite_magic
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    childprocess (0.8.0)
+      ffi (~> 1.0, >= 1.0.11)
+    digest (0.0.1)
+    domain_name (0.5.20170404)
+      unf (>= 0.0.5, < 1.0.0)
+    ffi (1.9.18)
+    http-cookie (1.0.3)
+      domain_name (~> 0.5)
+    httpclient (2.8.3)
+    mechanize (2.7.5)
+      domain_name (~> 0.5, >= 0.5.1)
+      http-cookie (~> 1.0)
+      mime-types (>= 1.17.2)
+      net-http-digest_auth (~> 1.1, >= 1.1.1)
+      net-http-persistent (~> 2.5, >= 2.5.2)
+      nokogiri (~> 1.6)
+      ntlm-http (~> 0.1, >= 0.1.1)
+      webrobots (>= 0.0.9, < 0.2)
+    mime-types (3.1)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2016.0521)
+    mini_portile2 (2.3.0)
+    net-http-digest_auth (1.4.1)
+    net-http-persistent (2.9.4)
+    nokogiri (1.8.1)
+      mini_portile2 (~> 2.3.0)
+    ntlm-http (0.1.1)
+    rubyzip (1.2.1)
+    selenium-webdriver (3.6.0)
+      childprocess (~> 0.5)
+      rubyzip (~> 1.0)
+    sqlite3 (1.3.13)
+    sqlite_magic (0.0.6)
+      sqlite3
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.7.4)
+    watir (6.8.4)
+      selenium-webdriver (~> 3.4, >= 3.4.1)
+    webrobots (0.1.2)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  digest
+  mechanize
+  nokogiri
+  scraperwiki!
+  watir
+
+RUBY VERSION
+   ruby 2.4.1p111
+
+BUNDLED WITH
+   1.15.4
diff --git a/scraper.rb b/scraper.rb
index 4bb57d6..9f36589 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -1,57 +1,77 @@
-require 'scraperwiki'
-require 'rubygems'
-require 'mechanize'
-require 'json'
-require 'nokogiri'
 require 'pp'
+require 'nokogiri'
+require 'mechanize'
+require 'watir'
+require 'digest'
+require 'scraperwiki'
 
-BASE_URL='http://www.nrsr.sk/web/Default.aspx?sid=poslanci/ospravedlnenia_result'
-
-BASE_CAL_URL='http://www.nrsr.sk/web/Services/CalendarService.asmx/RenderCalendar?month=10&year=2011&app="nrdvp"&lang=""'
-
-
-@agent = Mechanize.new { |agent|
-  agent.user_agent_alias = 'Mac Safari'
-}
+BASE_URL = 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/ospravedlnenia_result'.freeze
+URL_SUF = '&DatumOd=1900-1-1%200:0:0&DatumDo=2100-1-1%200:0:0&CisSchodze='.freeze
+PL = 'http://www.nrsr.sk/web/Default.aspx?sid=poslanci/zoznam_abc&ListType=0&CisObdobia='.freeze
 
-def excuse_url(m)
-  (1..5).each { |meeting_num|
-    yield :name => m[:name],
-          :url => "http://www.nrsr.sk/web/Default.aspx?sid=poslanci/ospravedlnenia_result&PoslanecMasterID="+m[:id]+
-          "&CisObdobia=#{meeting_num}&DatumOd=1900-1-1%200:0:0&DatumDo=2100-1-1%200:0:0&CisSchodze="
-  }
-end
+@agent = Mechanize.new
 
-def mop_list
-  list = []
-  @agent.get(BASE_URL) do |page|
-    page.at('div#_sectionLayoutContainer__panelContent').search('select#_sectionLayoutContainer_ctl01_PoslanecMasterID option').each do |opt|
-      next if opt.attr('value') == '-1'
-      yield :id => opt.attr('value'), :name => opt.text
+def list_mops
+  page = @agent.get('http://www.nrsr.sk/web/Default.aspx?sid=poslanci/zoznam_abc&ListType=0&CisObdobia=1')
+  termx = page.xpath('//select[@id = "_sectionLayoutContainer_ctl01__currentTerm"]//@value').map(&:value).max.to_i
+  (2..termx).each do |term|
+    pp "in term #{term}"
+    page = @agent.get("#{PL}#{term}")
+    page.xpath('//div[@class = "mps_list"]//li//a').each do |member|
+      l = member.attr('href')
+      i = l.match('.*PoslanecID=(.*)&.*')
+      yield mop_id: i[1], name: member.text, term: term, url: "http://www.nrsr.sk/#{l}"
+      # p id: i[1], name: member.text, term: term, url: "http://www.nrsr.sk/#{l}"
     end
   end
-  list
 end
 
-def list_excuses
-  mop_list do |mop|
-    excuse_url(mop) do |url|
-      @agent.get(url[:url]) do |page|
-        next unless page.at('table.tab_zoznam')
-        page.at('table.tab_zoznam').search('tr').each do |r|
-          next if r.attr('class') == 'tab_zoznam_header'
-          yield :name => url[:name], :date => r.search('td')[2].text.strip, :party => r.search('td')[1].text.strip, :reason => r.search('td')[3].text.strip
-        end
-      end
+def pager(url)
+  pages = []
+  page = @agent.get(url)
+  pages.push(page) if page.at('table.tab_zoznam')
+  if page.at('//table[@class="tab_zoznam"]//table')
+    links = page.xpath('//table[@class="tab_zoznam"]//table//tr/td//@href').map(&:value).uniq
+    links.each do |link|
+      link.slice! 'javascript:'
+      begin
+        br = Watir::Browser.start(url, :phantomjs)
+      rescue Net::ReadTimeout, Net::OpenTimeout, Errno::ETIMEDOUT, Errno::ECONNREFUSED => ex
+        puts "#{ex.class} detected, retrying"
+        retry
+      end
+      br.execute_script(link)
+      sleep 5
+      pages.push(Nokogiri::HTML(br.html))
     end
   end
+  pages
 end
 
-def make_uuid(i)
-  { "unique_id" => "#{i[:name].downcase.gsub(/\W/, '-')}-#{i[:date].gsub(/(\W|\.)/, '-')}" }
+def excuses
+  list_mops do |excuse|
+    excuse_url = "#{BASE_URL}&PoslanecMasterID=#{excuse[:mop_id]}&CisObdobia=#{excuse[:term]}#{URL_SUF}"
+    pages = pager(excuse_url)
+    pages.each do |page|
+      page.at('table.tab_zoznam').search('tr').each do |r|
+        next if r.attr('class') == 'tab_zoznam_header'
+        next if r.attr('class') == 'pager'
+        next if r.search('td')[0].text.strip.length <= 2
+        yield name: excuse[:name],
+              mop_id: excuse[:mop_id],
+              date: r.search('td')[2].text.strip,
+              term: excuse[:term],
+              party: r.search('td')[1].text.strip,
+              reason: r.search('td')[3].text.strip
+      end
+    end
+  end
 end
 
-list_excuses do |item|
-  pp item.merge(make_uuid(item))
-  ScraperWiki.save(['unique_id'], item.merge(make_uuid(item)))
+p Time.now
+excuses do |item|
+  id = { 'excuse_id' => Digest::MD5.hexdigest("#{item[:date].gsub(/[^0-9,.]/, '')}#{item[:mop_id]}") }
+  # p item.merge(id)
+  ScraperWiki.save_sqlite(['excuse_id'], item.merge(id))
 end
+p Time.now
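
Note on pager: the NRSR results table paginates through ASP.NET
"javascript:__doPostBack(...)" links rather than plain URLs, which is why
the patch drives PhantomJS through Watir: it loads the results page,
executes each pager link as a script, and hands the rendered HTML to
Nokogiri. One caveat is that the loop starts a fresh browser per link and
never closes it, so long runs can pile up PhantomJS processes. A minimal
sketch of the same idea with cleanup; the helper name fetch_postback_pages
and the 5-second settle delay are illustrative, not part of the patch:

    require 'watir'
    require 'nokogiri'

    # Run each ASP.NET postback link in a headless browser and collect the
    # rendered HTML. Assumes phantomjs is on PATH and that each link has
    # already had its 'javascript:' prefix stripped, as pager does.
    def fetch_postback_pages(url, links)
      links.map do |link|
        br = Watir::Browser.start(url, :phantomjs)
        begin
          br.execute_script(link) # fires __doPostBack(...) for the next page
          sleep 5                 # crude wait for the postback to render
          Nokogiri::HTML(br.html)
        ensure
          br.close                # avoid leaking one PhantomJS per page
        end
      end
    end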
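
Note on the unique key: the old scraper deduplicated on a name+date
unique_id; the new one saves under excuse_id, an MD5 of the date stripped
down to digits, commas, and dots, concatenated with the member's
PoslanecMasterID. ScraperWiki.save_sqlite(['excuse_id'], ...) upserts on
that key, so re-runs update existing rows instead of duplicating them. A
standalone sketch of the derivation; the sample values are made up:

    require 'digest'

    # Same normalisation as the scraper: keep only digits, commas, and
    # dots from the date, append the member id, and hash the result.
    def excuse_id(date, mop_id)
      Digest::MD5.hexdigest("#{date.gsub(/[^0-9,.]/, '')}#{mop_id}")
    end

    # "7. 2. 2017" normalises to "7.2.2017", so the same row scraped on
    # two different runs always yields the same 32-character key.
    puts excuse_id('7. 2. 2017', '123')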