-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearch_new_Grants.py
136 lines (102 loc) · 3.96 KB
/
search_new_Grants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""
Written in Python 3.5
The purpose of this script is to search PubMed for publications that were funded by MIDAS grants."""
import requests
import xml.etree.ElementTree as ET
import datetime as dt
from Bio import Entrez
import os
import time
path1=os.path.dirname(os.path.abspath(__file__))
def search(query):
"""Search PubMed by grant number and return primary IDs."""
Entrez.email = '[email protected]'
handle = Entrez.esearch(db='pubmed',
term=query,
retmax=1000,
)
time.sleep(10)
results = Entrez.read(handle)
return results
def fetch_doc_sum(id_list):
"""Search PubMed by primary IDs to obtain DocSums for each publication."""
ids = ','.join(id_list)
Entrez.email = '[email protected]'
handle = Entrez.efetch(db='pubmed',
id=ids,
retmode='xml',
version='2.0')
records = handle.read()
return records
def checkXML(XML, path):
"""Looks for path to XML tag and pulls out the text therein."""
if XML.find(path) is not None:
if XML.find(path).text is not None:
return XML.find(path).text
else:
return ""
else:
return ""
def parse_authors(authors):
"""Parses out authors' last names and initials of first/middle names."""
authorList = list()
for author in authors:
authorList.append(checkXML(author, './LastName') + " "
+ checkXML(author, './Initials'))
return authorList
def parse_grants(grants):
"""Parses grant numbers from DocSum XML."""
grant_list = list()
for grant in grants:
grant_list.append(checkXML(grant, './GrantID'))
return grant_list
def extract_info(pub_med_XML):
"""Not currently used.
Extracts article title, author list, journal, date of publication, DOI, and PMID from the DocSum XML."""
tree = ET.fromstring(pub_med_XML)
articles = tree.findall('./PubmedArticle/MedlineCitation')
from collections import defaultdict
paper = defaultdict(list)
for article in articles:
paper['TITLE'] = article.find('./Article/ArticleTitle').text
paper['AUTHORS'] = parse_authors(article.findall('./Article/AuthorList/Author'))
paper['PMID'] = article.find('./PMID').text
paper['DOI'] = checkXML(article, './Article/ELocationID')
paper['GRANT'] = parse_grants(article.findall('./Article/GrantList/Grant'))
#for l in parse_grants(article.findall('./Article/GrantList/Grant')):
# paper[article.find('./PMID').text].append(l.replace(" ",''))
paper['JOURNAL'] = article.find('./Article/Journal/ISOAbbreviation').text
papers.append(paper)
return paper
obc_api_url = "http://devapi.onbc.io/publications"
# Executes functions
if __name__ == '__main__':
import sys
filename= sys.argv[1]
grants_in = open(filename, "r")
id_list=[]
all_ids = set()
for line in grants_in:
line = line.splitlines()
time.sleep(1)
results = search(line)
id_list = results['IdList']
for id in id_list:
all_ids.add(id)
grants_in.close()
pmid_lst = list()
in_obc = requests.get(obc_api_url).json()
for publication in in_obc:
pmid_lst.append(publication.get("pmid"))
new_pub_ids = []
for id in all_ids:
if id not in pmid_lst:
new_pub_ids.append(id)
else : continue
papers = fetch_doc_sum(new_pub_ids)
info = extract_info(papers)
today = dt.datetime.today().strftime("%Y-%m-%d")
fname = "most_recent_publications_{}.xml".format(today)
with open(fname, "a",encoding="utf-8") as xml_out:
xml_out.write(papers)
print("Number of new publications: ", len(new_pub_ids))