Get_CfA_Bibliography_Info.py
# coding: utf-8
import requests
import json
import csv
import time
import codecs
import cStringIO
from datetime import datetime
import requests.packages.urllib3
requests.packages.urllib3.disable_warnings()  #silence urllib3 SSL warnings (common on older Python 2 builds)
#NOTE: typical ADS API users have a limit of 50,000 total results and 200 results per page.
#As of Nov 12, 2014 this script is retrieving 44,451 results, so we're coming close to
#the limit of total results.
devkey = open('dev_key.txt','r').read().strip()  #txt file that only has your dev key; strip() drops a trailing newline that would break the Authorization header
class UnicodeWriter:
    def __init__(self, f, dialect=csv.excel, encoding="utf-8-sig", **kwds):
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([s.encode("utf-8") for s in row])
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        data = self.encoder.encode(data)
        self.stream.write(data)
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
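#(UnicodeWriter above follows the recipe from the Python 2 csv module docs:
#csv cannot write unicode directly, so each row is encoded to UTF-8 through
#an in-memory cStringIO buffer before being written to the target file.)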
url = 'https://api.adsabs.harvard.edu/v1/search/query/?q=bibgroup:cfa'
print url #printing url for troubleshooting
headers={'Authorization': 'Bearer '+devkey}
content = requests.get(url, headers=headers)
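#Hedged robustness sketch (assumes a non-200 status means the query or the
#dev key was rejected): fail fast before trying to parse the JSON below.
if content.status_code != 200:
    raise RuntimeError("ADS API returned HTTP "+str(content.status_code)+": "+content.text)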
results=content.json()
k = results['response']['docs'][0]  #peek at the first record (not used below)
total = results['response']['numFound']
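#Hedged sketch tied to the NOTE above: warn if numFound is over the typical
#50,000-result API cap (assumes that cap still applies to this dev key).
if total > 50000:
    print "WARNING: numFound exceeds the 50,000-result cap; later pages may come back empty."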
print "Total Results: "+str(total)
timestamp = datetime.now().strftime("%Y_%m%d_%H%M")
resultFile = open("cfa_bibcodes"+timestamp+".csv",'wb')
wr = UnicodeWriter(resultFile,dialect='excel',quoting=csv.QUOTE_ALL)
#write header row
wr.writerow(['Bibcode','PubDate','Title','Journal','Volume','Page','Citations','URL','Properties','Refereed?'])
#how many pages of 200 results we need (integer division in Python 2)
loop = total/200
print "Looping script "+str(loop+1)+" times."
startnum = 0
#looping a lot!
for i in range(1,loop+2):
    #for i in range(1,3): #use this line instead of the one above for short testing
    print "Results Page "+str(i)
    url = 'https://api.adsabs.harvard.edu/v1/search/query/?q=bibgroup:cfa&start='+str(startnum)+'&rows=200&fl=bibcode,pubdate,aff,author,title,pub,volume,page,property,citation_count'
    print url
    headers = {'Authorization': 'Bearer '+devkey}
    content = requests.get(url, headers=headers)
    results = content.json()
    docs = results['response']['docs']
    for x in docs:
        print x #debugging: echoes every full record; comment out for a quieter run
        bibcode = x['bibcode']
        print bibcode
        absurl = "http://adsabs.harvard.edu/abs/"+bibcode
        try:
            title = x['title'] #ADS returns title as a list of strings
            titleclean = ''.join(title)
        except KeyError:
            titleclean = ''
        try:
            pub = x['pub']
        except KeyError:
            pub = ''
        try:
            pubdate = x['pubdate']
        except KeyError:
            pubdate = ''
        try:
            volume = x['volume']
        except KeyError:
            volume = ''
        try:
            page = x['page'] #also returned as a list
            pageclean = "'"+''.join(page) #leading apostrophe keeps Excel from reformatting page values
        except KeyError:
            pageclean = ''
        try:
            prop = x['property']
            proplist = '; '.join(prop)
            if 'REFEREED' in prop:
                refstat = 'Yes'
            else:
                refstat = 'No'
        except KeyError:
            proplist = ''
            refstat = ''
        try:
            citation_count = x['citation_count']
        except KeyError:
            citation_count = ''
        try:
            year = x['year'] #'year' is not in the fl list above, so this always falls through; it is never written to the CSV
        except KeyError:
            year = ''
        row = [bibcode, pubdate, titleclean, pub, volume, pageclean, str(citation_count), absurl, proplist, refstat]
        wr.writerow(row)
    startnum += 200
    time.sleep(1) #be polite to the API between pages
resultFile.close()
print "Finished loops through all "+str(total)+" results!"