forked from houshuang/folders2web
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Updating to use curl (like check-oa)
- Loading branch information
Showing 1 changed file with 120 additions and 25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,45 +1,140 @@ | ||
# encoding: UTF-8 | ||
|
||
# researchr scripts relevant to BibDesk (the right one is executed from the bottom of the file) | ||
# script that goes through all PDFs in the BibDesk PDF folder, checks whether each one has a Finder download URL, | ||
# checks whether that URL is OA, and if it is, adds the fields OA-URL and URL to the bibtex entry | ||
|
||
$:.push(File.dirname($0)) | ||
require 'utility-functions' | ||
require 'appscript' | ||
require 'cgi' | ||
require 'net/http' | ||
require 'open-uri' | ||
|
||
|
||
# this script goes through all the BibDesk PDFs, extracts download URLs, and adds these fields to BibDesk | ||
|
||
BibDesk = Appscript.app('BibDesk') | ||
|
||
puts "Updating URLs on all files in #{PDF_path}" | ||
def is_url(url) | ||
return true if url.index("http") | ||
end | ||
|
||
# iterate through Bibdesk PDF directory | ||
Dir.foreach(PDF_path) do |f| | ||
next if f == '.' or f == '..' | ||
next unless f[-4..-1].downcase == '.pdf' | ||
docu = f[0..-5] | ||
puts docu | ||
def update_url(pub, url) | ||
pub.fields["Url"].value.set(url) | ||
pub.fields["OA-URL"].value.set(url) | ||
puts "*" * 78 | ||
puts "OA! #{url}" | ||
puts "*" * 78 | ||
end | ||
|
||
a = `mdls -name kMDItemWhereFroms "#{PDF_path+"/"+f}"` | ||
next unless a.index("http") | ||
def checkOA(url) | ||
return false unless is_url(url) | ||
puts "Checking OA: #{url}" | ||
res = checkOArun(url) | ||
puts res ? "True" : "False" | ||
return res | ||
end | ||
|
||
b = a.split('"') | ||
def checkOArun(origurl) | ||
url = origurl.gsub(/https?\:\/\/?/,'') | ||
uri, *path = url.split("/") | ||
path = "/" + path.join("/") | ||
origurl.sub!(':/', '://') unless origurl.index("//") | ||
|
||
pub = BibDesk.document.search({:for =>docu})[0] | ||
chrome_agent = 'Mozilla/5.0 (X11; CrOS i686 1660.57.0) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.46 Safari/535.19' | ||
curl_opts = "--connect-timeout 5 -A '#{chrome_agent}'" | ||
|
||
# first check against whitelist | ||
whitelist = [ # list of URLs that don't need to be downloaded to check, first is URI, second is path | ||
[/arxiv\.org/, /\.pdf$/] | ||
] | ||
whitelist.each { |comp| return true if uri.index(comp[0]) && path.index(comp[1]) } | ||
|
||
# faking agent, to avoid no-robots | ||
|
||
# grab header using curl | ||
response = `curl #{curl_opts} -I '#{origurl}'` | ||
|
||
possible_ctypes = [ | ||
"application/pdf", | ||
"application/x-pdf", | ||
"application/vnd.pdf", | ||
"application/text.pdf"] | ||
|
||
# if ctype matches PDF, true, otherwise explore further | ||
possible_ctypes.each {|ctype| return true if response.index("Content-Type: #{ctype}")} | ||
|
||
# try curl | ||
`curl #{curl_opts} -r 0-99 -s '#{origurl}' > output.tmp` | ||
|
||
return (`file output.tmp;rm output.tmp`.index("PDF document") ? true : false) | ||
|
||
# we tried, but we failed. | ||
return false | ||
|
||
pub.fields["Url"].value.set(b[1].gsub(".myaccess.library.utoronto.ca","")) | ||
puts b[1] | ||
if b[3] && b[3].scan(/\&q\=(.+?)\&/).size > 0 | ||
pub.fields["GScholar search term"].value.set CGI::unescape($~[1]).gsub('"','') | ||
puts b[3] | ||
end | ||
puts "*" *40 | ||
|
||
end | ||
|
||
|
||
# kMDItemWhereFroms = ( | ||
# "http://www.lancs.ac.uk/fss/organisations/netlc/past/nlc2010/abstracts/PDFs/Mackness.pdf", | ||
# "http://scholar.google.com/scholar?q=mooc&hl=en&btnG=Search&as_sdt=1%2C5&as_sdtp=on" | ||
# ) | ||
if __FILE__==$0 | ||
t = Time.now | ||
puts "Updating URLs on all files in #{PDF_path}" | ||
|
||
# logfiles | ||
dontmatch = File.open('dontmatch.txt','w') | ||
nourl = File.open('noturl.txt','w') | ||
notoa = File.open('notoa.txt','w') | ||
oa = File.open('oa.txt','w') | ||
|
||
|
||
# iterate through Bibdesk PDF directory | ||
Dir.foreach(PDF_path) do |f| | ||
next if f == '.' or f == '..' | ||
next unless f.size < 4 || f[-4..-1].downcase == '.pdf' | ||
|
||
docu = f[0..-5] | ||
|
||
p docu | ||
pub = BibDesk.document.search({:for =>docu}) | ||
|
||
# PDF name doesn't match any citekeys | ||
unless pub.class == Array && pub.size > 0 | ||
puts "#{docu}: Doesn't match citekey" | ||
dontmatch << docu << "\n" | ||
next | ||
end | ||
|
||
# already has OA pub | ||
if pub[0].fields["OA-Url"].value.get.size > 0 | ||
puts "#{docu}: Already OA" | ||
next | ||
end | ||
|
||
# if already has URL field, check if OA | ||
url = try { pub[0].fields["URL"].value.get } | ||
if is_url(url) && checkOA(url) | ||
update_url(pub[0], url) | ||
puts "#{docu}: URL OA" | ||
oa << docu << "\n" | ||
next | ||
end | ||
|
||
# try to get the download URL from Finder metadata | ||
a = `mdls -name kMDItemWhereFroms "#{PDF_path}/#{docu}.pdf"` | ||
|
||
if url = try {a.split('"')[1]} && checkOA(url) | ||
update_url(pub[0], url) | ||
puts "#{docu}: Finder OA" | ||
oa << docu << "\n" | ||
next | ||
end | ||
if url.class == String | ||
puts "#{docu}: No OA" | ||
notoa << docu << "\n" | ||
else | ||
puts "#{docu}: No file" | ||
nourl << docu << "\n" | ||
end | ||
end | ||
puts Time.now-t | ||
|
||
end |