Updating to use curl (like check-oa)

pygospa · Dec 5, 2012 · 034fe3e · 034fe3e
1 parent 387b3b5
commit 034fe3e
Showing 1 changed file with 120 additions and 25 deletions.
diff --git a/update_urls.rb b/update_urls.rb
@@ -1,45 +1,140 @@
 # encoding: UTF-8
 
-# researchr scripts relevant to BibDesk (the right one is executed from the bottom of the file)
+# script that goes through all PDFs in the BibDesk PDF folder, checks if it has a Finder download URL,
+# checks if that URL is OA, and if it is, adds the field OA-URL and URL to the bibtex entry
 
 $:.push(File.dirname($0))
 require 'utility-functions'
 require 'appscript'
 require 'cgi'
+require 'net/http'
+require 'open-uri'
+
 
 # this script goes through all the BibDesk PDFs, extracts download URLs, and adds these fields to BibDesk
 
+# Application reference for BibDesk obtained through Appscript; the main loop
+# calls BibDesk.document.search on it to look up publications.
 BibDesk = Appscript.app('BibDesk')
 
-puts "Updating URLs on all files in #{PDF_path}"
+# Reports whether url looks like a web address.
+# Returns true when url is a String containing "http", and false for any
+# other value -- including nil, which callers can pass when a field lookup
+# produced nothing (the original raised NoMethodError in that case).
+def is_url(url)
+  url.is_a?(String) && url.include?("http")
+end
 
-# iterate through Bibdesk PDF directory
-Dir.foreach(PDF_path) do |f|
-  next if f == '.' or f == '..'
-  next unless f[-4..-1].downcase == '.pdf'
-   
-  docu = f[0..-5]
-  puts docu
+# Stores url in both the "Url" and the "OA-URL" field of pub, then prints a
+# banner (a 78-star rule above and below) announcing the open-access URL.
+def update_url(pub, url)
+  %w[Url OA-URL].each { |field| pub.fields[field].value.set(url) }
+  rule = "*" * 78
+  puts rule, "OA! #{url}", rule
+end
 
-  a = `mdls -name kMDItemWhereFroms "#{PDF_path+"/"+f}"`
-  next unless a.index("http")
+# Logging wrapper around checkOArun: returns false straight away when url is
+# not a URL according to is_url; otherwise prints the URL being checked,
+# prints "True" or "False" for the outcome, and returns checkOArun's result.
+def checkOA(url)
+  if is_url(url)
+    puts "Checking OA: #{url}"
+    verdict = checkOArun(url)
+    puts(verdict ? "True" : "False")
+    verdict
+  else
+    false
+  end
+end
 
-  b = a.split('"')
+# Runs the command given as an argv array directly -- no shell is involved, so
+# URLs and other arguments need no quoting -- and returns its stdout as a
+# binary string. When input is given it is written to the command's stdin
+# first. Returns "" if the command cannot be started or the pipe breaks.
+def oa_capture(argv, input = nil)
+  IO.popen(argv, input ? 'r+b' : 'rb') do |io|
+    if input
+      io.write(input)
+      io.close_write
+    end
+    io.read
+  end
+rescue SystemCallError
+  ''
+end
+
+# Decides whether origurl serves a PDF directly.
+#
+# Steps, in order:
+#   1. host/path whitelist (no network access),
+#   2. Content-Type of a curl HEAD request,
+#   3. file(1) run over the first bytes of the body.
+# Returns true or false. Side effect: a URL written as "http:/host/..." is
+# repaired to "http://host/..." in place, so the caller's string changes too.
+def checkOArun(origurl)
+  url = origurl.gsub(/https?\:\/\/?/,'')
+  uri, *path = url.split("/")
+  # nothing left once the scheme is stripped (e.g. "http://"): not checkable
+  return false if uri.nil? || uri.empty?
+  path = "/" + path.join("/")
+  origurl.sub!(':/', '://') unless origurl.index("//")
 
-  pub = BibDesk.document.search({:for =>docu})[0]
+  # faking agent, to avoid no-robots
+  chrome_agent = 'Mozilla/5.0 (X11; CrOS i686 1660.57.0) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.46 Safari/535.19'
+  curl = ['curl', '--connect-timeout', '5', '-A', chrome_agent]
+
+  # first check against whitelist
+  whitelist = [ # list of URLs that don't need to be downloaded to check, first is URI, second is path
+    [/arxiv\.org/, /\.pdf$/]
+  ]
+  return true if whitelist.any? { |host_re, path_re| uri =~ host_re && path =~ path_re }
+
+  # grab header using curl
+  response = oa_capture(curl + ['-I', origurl])
+
+  possible_ctypes = [
+    "application/pdf",
+    "application/x-pdf",
+    "application/vnd.pdf",
+    "application/text.pdf"]
+
+  # header names are case-insensitive (HTTP/2 responses print them in lower
+  # case), so match without regard to case; if no PDF type, explore further
+  ctype_re = /^content-type:\s*(?:#{Regexp.union(possible_ctypes).source})/i
+  return true if response =~ ctype_re
+
+  # header was inconclusive: fetch the first 100 bytes of the body and let
+  # file(1) sniff them from stdin, which avoids a scratch file on disk
+  head = oa_capture(curl + ['-r', '0-99', '-s', origurl])
+  return false if head.empty?
+
+  # cap what is piped in case the server ignored the range request
+  oa_capture(['file', '-'], head[0, 1024]).index("PDF document") ? true : false
 
-  pub.fields["Url"].value.set(b[1].gsub(".myaccess.library.utoronto.ca",""))
-  puts b[1]
-  if b[3] && b[3].scan(/\&q\=(.+?)\&/).size > 0
-    pub.fields["GScholar search term"].value.set CGI::unescape($~[1]).gsub('"','')
-    puts b[3]
-  end
-  puts "*" *40
-
 end
 
 
-# kMDItemWhereFroms = (
-#     "http://www.lancs.ac.uk/fss/organisations/netlc/past/nlc2010/abstracts/PDFs/Mackness.pdf",
-#     "http://scholar.google.com/scholar?q=mooc&hl=en&btnG=Search&as_sdt=1%2C5&as_sdtp=on"
-# )
+# Script entry point: walks the BibDesk PDF folder, looks each PDF's name up
+# in BibDesk, and records an open-access URL for the entry when one is found,
+# first from the entry's own URL field, then from the Finder download metadata.
+# Outcomes are logged, one name per line, to dontmatch.txt, noturl.txt,
+# notoa.txt and oa.txt in the working directory.
+if __FILE__==$0
+  t = Time.now
+  puts "Updating URLs on all files in #{PDF_path}"
+
+  # logfiles
+  dontmatch = File.open('dontmatch.txt','w')
+  nourl = File.open('noturl.txt','w')
+  notoa = File.open('notoa.txt','w')
+  oa = File.open('oa.txt','w')
+
+  begin
+    # iterate through Bibdesk PDF directory
+    Dir.foreach(PDF_path) do |f|
+      # File.extname is "" for ".", ".." and extension-less names, so this one
+      # test skips every entry that is not a PDF (the original guard was
+      # inverted and let names shorter than four characters through)
+      next unless File.extname(f).downcase == '.pdf'
+
+      docu = f[0..-5]
+
+      p docu
+      pub = BibDesk.document.search({:for =>docu})
+
+      # PDF name doesn't match any citekeys
+      if !pub.is_a?(Array) || pub.empty?
+        puts "#{docu}: Doesn't match citekey"
+        dontmatch << docu << "\n"
+        next
+      end
+
+      # already has OA pub
+      # field names here follow the ones update_url writes ("Url", "OA-URL");
+      # NOTE(review): the original read "OA-Url"/"URL" -- harmless only if
+      # BibDesk treats field names case-insensitively, confirm
+      if pub[0].fields["OA-URL"].value.get.size > 0
+        puts "#{docu}: Already OA"
+        next
+      end
+
+      # if already has URL field, check if OA
+      # (try comes from outside this file; it can evidently yield a non-String,
+      # so guard before handing the value on)
+      url = try { pub[0].fields["Url"].value.get }
+      if url.is_a?(String) && checkOA(url)
+        update_url(pub[0], url)
+        puts "#{docu}: URL OA"
+        oa << docu << "\n"
+        next
+      end
+
+      # try to get d/l URL from Finder metadata; argv form bypasses the shell,
+      # so quotes or "$" in a file name cannot break the command
+      a = IO.popen(['mdls', '-name', 'kMDItemWhereFroms', File.join(PDF_path, f)]) { |io| io.read }
+
+      # first quoted string in the mdls output, nil when there is none.
+      # Assigned on its own line: "url = x && checkOA(url)" parses as
+      # "url = (x && checkOA(url))", which tested the stale URL from above
+      url = a.split('"')[1]
+      if url && checkOA(url)
+        update_url(pub[0], url)
+        puts "#{docu}: Finder OA"
+        oa << docu << "\n"
+        next
+      end
+      if url
+        puts "#{docu}: No OA"
+        notoa << docu << "\n"
+      else
+        puts "#{docu}: No file"
+        nourl << docu << "\n"
+      end
+    end
+  ensure
+    # flush and release the logs even if a lookup raised part-way through
+    [dontmatch, nourl, notoa, oa].each { |log| log.close }
+  end
+  puts Time.now-t
+
+end