Skip to content

Commit

Permalink
Updating to use curl (like check-oa)
Browse files Browse the repository at this point in the history
  • Loading branch information
houshuang committed Dec 5, 2012
1 parent 387b3b5 commit 034fe3e
Showing 1 changed file with 120 additions and 25 deletions.
145 changes: 120 additions & 25 deletions update_urls.rb
Original file line number Diff line number Diff line change
@@ -1,45 +1,140 @@
# encoding: UTF-8

# researchr scripts relevant to BibDesk (the right one is executed from the bottom of the file).
# This script goes through every PDF in the BibDesk PDF folder, checks whether the entry or the file's
# Finder metadata has a download URL, checks whether that URL is OA, and if it is, sets the OA-URL and
# URL fields on the BibTeX entry.

$:.push(File.dirname($0))
require 'utility-functions'
require 'appscript'
require 'cgi'
require 'net/http'
require 'open-uri'


# this script goes through all the BibDesk PDFs, extracts download URLs, and adds these fields to BibDesk

BibDesk = Appscript.app('BibDesk')

puts "Updating URLs on all files in #{PDF_path}"
# Report whether +url+ looks like a web address.
#
# The test is deliberately loose: any String that contains "http" anywhere
# qualifies, so malformed values such as "http:/host/x" are still accepted.
#
# url - candidate value. May be nil or a non-String (for example when a field
#       lookup failed); such values are simply "not a URL" rather than an error.
#
# Returns true or false.
def is_url(url)
  url.is_a?(String) && url.include?("http")
end

# iterate through Bibdesk PDF directory
Dir.foreach(PDF_path) do |f|
next if f == '.' or f == '..'
next unless f[-4..-1].downcase == '.pdf'
docu = f[0..-5]
puts docu
# Record +url+ as the open-access location of a publication.
#
# pub - publication object; its "Url" and "OA-URL" fields are both set to +url+.
# url - the URL to store.
#
# Prints a three-line banner announcing the URL. Returns nil.
def update_url(pub, url)
  %w[Url OA-URL].each { |field| pub.fields[field].value.set(url) }

  banner = "*" * 78
  puts banner, "OA! #{url}", banner
end

a = `mdls -name kMDItemWhereFroms "#{PDF_path+"/"+f}"`
next unless a.index("http")
# Logging front end for checkOArun.
#
# url - value to test. Anything is_url rejects is answered with false
#       immediately, without printing and without calling checkOArun.
#
# Prints "Checking OA: <url>" followed by "True" or "False".
# Returns whatever checkOArun returned (or false when the value was rejected).
def checkOA(url)
  if is_url(url)
    puts "Checking OA: #{url}"
    verdict = checkOArun(url)
    puts(verdict ? "True" : "False")
    verdict
  else
    false
  end
end

b = a.split('"')
# Run +argv+ directly (no shell, so no quoting problems with odd URLs) and
# return what it wrote to standard output.
#
# argv  - Array of program name and arguments.
# limit - when given, read at most this many bytes and then stop.
# input - when given, String fed to the program's standard input.
#
# Returns a String; empty when the program could not be run.
def oa_capture(argv, limit = nil, input = nil)
  IO.popen(argv, input ? 'r+' : 'r') do |io|
    io.binmode
    if input
      io.write(input)
      io.close_write
    end
    (limit ? io.read(limit) : io.read).to_s
  end
rescue SystemCallError => e
  # Best effort: a missing tool or a broken pipe means "could not tell".
  warn "#{argv.first}: #{e.message}"
  ''
end

# Decide whether +origurl+ delivers a PDF directly, i.e. looks open access.
#
# Checks, in order, stopping at the first positive answer:
#   1. a whitelist of host/path patterns accepted without any download,
#   2. the Content-Type header of a HEAD request (curl -I),
#   3. the first 100 bytes of the body, identified by file(1).
#
# origurl - String URL. NOTE: a URL written with a single slash after the
#           scheme ("http:/host/x") is repaired in place, so the caller's
#           string is modified.
#
# Returns true when the URL appears to serve a PDF, false otherwise.
def checkOArun(origurl)
  url = origurl.gsub(/https?\:\/\/?/, '')
  host, *segments = url.split("/")
  return false if host.nil? || host.empty? # nothing left after the scheme

  path = "/" + segments.join("/")
  origurl.sub!(':/', '://') unless origurl.index("//")

  # URLs that don't need to be downloaded to check: [host pattern, path pattern]
  whitelist = [
    [/arxiv\.org/, /\.pdf$/]
  ]
  return true if whitelist.any? { |host_re, path_re| host =~ host_re && path =~ path_re }

  # fake a browser agent, to avoid no-robots responses
  chrome_agent = 'Mozilla/5.0 (X11; CrOS i686 1660.57.0) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.46 Safari/535.19'
  curl = ['curl', '--connect-timeout', '5', '-A', chrome_agent]

  # grab the response headers only
  headers = oa_capture(curl + ['-I', origurl])

  possible_ctypes = [
    "application/pdf",
    "application/x-pdf",
    "application/vnd.pdf",
    "application/text.pdf"
  ]
  # Header names are case-insensitive (HTTP/2 responses print them in lower case).
  alternatives = possible_ctypes.map { |ctype| Regexp.escape(ctype) }.join('|')
  ctype_re = Regexp.new("^content-type:\\s*(?:#{alternatives})", Regexp::IGNORECASE)
  return true if headers =~ ctype_re

  # Otherwise fetch the start of the body and let file(1) identify it. The
  # bytes are piped straight through, so no scratch file is left behind.
  head = oa_capture(curl + ['-r', '0-99', '-s', origurl], 100)
  return false if head.empty?

  oa_capture(['file', '-'], nil, head).include?("PDF document")
end


# kMDItemWhereFroms = (
# "http://www.lancs.ac.uk/fss/organisations/netlc/past/nlc2010/abstracts/PDFs/Mackness.pdf",
# "http://scholar.google.com/scholar?q=mooc&hl=en&btnG=Search&as_sdt=1%2C5&as_sdtp=on"
# )
# Script entry point: walk the BibDesk PDF folder and, for every PDF whose
# name matches a BibDesk entry, try to find an open-access URL for it.
#
# Sources tried, in order: the entry's existing URL field, then the download
# URL recorded in the file's Finder metadata (kMDItemWhereFroms). The outcome
# for each file is printed and appended to one of four log files.
#
# PDF_path and try are not defined in this file; presumably they come from
# utility-functions -- confirm.
if __FILE__ == $0
  started = Time.now
  puts "Updating URLs on all files in #{PDF_path}"

  # logfiles, one per outcome
  dontmatch = File.open('dontmatch.txt', 'w')
  nourl = File.open('noturl.txt', 'w')
  notoa = File.open('notoa.txt', 'w')
  oa = File.open('oa.txt', 'w')

  # nil and other non-String values are simply "no URL here"
  usable_url = lambda { |candidate| candidate.is_a?(String) && is_url(candidate) }

  begin
    # iterate through Bibdesk PDF directory
    Dir.foreach(PDF_path) do |f|
      next if f == '.' || f == '..'
      next unless File.extname(f).downcase == '.pdf'

      docu = f[0..-5] # file name without the four-character extension

      p docu
      pub = BibDesk.document.search({:for => docu})

      # PDF name doesn't match any citekeys
      unless pub.is_a?(Array) && !pub.empty?
        puts "#{docu}: Doesn't match citekey"
        dontmatch << docu << "\n"
        next
      end
      entry = pub[0]

      # already has OA pub
      # NOTE(review): read here as "OA-Url" but written by update_url as
      # "OA-URL" -- confirm BibDesk treats field names case-insensitively.
      if entry.fields["OA-Url"].value.get.size > 0
        puts "#{docu}: Already OA"
        next
      end

      # if already has URL field, check if OA
      url = try { entry.fields["URL"].value.get }
      if usable_url.call(url) && checkOA(url)
        update_url(entry, url)
        puts "#{docu}: URL OA"
        oa << docu << "\n"
        next
      end

      # try to get d/l URL from Finder metadata; the first quoted string in the
      # mdls output is the download URL. mdls is run without a shell and on the
      # real file name, so unusual names and ".PDF" extensions work too.
      metadata = IO.popen(['mdls', '-name', 'kMDItemWhereFroms', File.join(PDF_path, f)]) { |io| io.read }
      finder_url = metadata.to_s.split('"')[1]

      if finder_url && checkOA(finder_url)
        update_url(entry, finder_url)
        puts "#{docu}: Finder OA"
        oa << docu << "\n"
        next
      end

      if usable_url.call(finder_url) || usable_url.call(url)
        # a URL was found, but nothing open access behind it
        puts "#{docu}: No OA"
        notoa << docu << "\n"
      else
        puts "#{docu}: No file"
        nourl << docu << "\n"
      end
    end
  ensure
    # flush and release the logs even when the run is interrupted by an error
    [dontmatch, nourl, notoa, oa].each { |log| log.close }
  end

  puts Time.now - started
end

0 comments on commit 034fe3e

Please sign in to comment.