-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathentrez-retriever.py
98 lines (86 loc) · 5.24 KB
/
entrez-retriever.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python3
'''
Entrez Retriever - Use the Entrez API to perform NCBI queries for sequences
Copyright (C) 2024 fonors, goncalof21, MadalenaFranco2 & scmdcunha
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
'''
import argparse, requests, xml.etree.ElementTree as ET
parser = argparse.ArgumentParser(
prog = 'Entrez Retriever',
description = 'Use the Entrez API to perform NCBI queries for sequences.')
def search_args(argparser):
"""
Gets the arguments the user defined when running the script and returns them.
"""
argparser.add_argument("-db", "--database", help="Database to search in.")
argparser.add_argument("-t", "--term", help="Entrez text query.")
argparser.add_argument("-w", "--webenv", help="Web environment string returned from a previous ESearch, EPost or ELink call.")
argparser.add_argument("-q", "--querykey", help="Integer query key returned by a previous ESearch, EPost or ELink call.")
argparser.add_argument("-Rs", "--retstart", help="Sequential index of the first UID in the retrieved set to be shown in the XML output.")
argparser.add_argument("-Rm", "--retmax", help="Total number of UIDs from the retrieved set to be shown in the XML output.")
argparser.add_argument("-Rt", "--rettype", help="Retrieval type. There are two allowed values for ESearch: 'uilist' (default), which displays the standard XML output, and 'count', which displays only the <Count> tag.")
argparser.add_argument("-Rmd", "--retmode", help="Retrieval type. Determines the format of the returned output. The default value is 'xml' for ESearch XML, but 'json' is also supported to return output in JSON format.")
argparser.add_argument("-s", "--sort", help="Specifies the method used to sort UIDs in the ESearch output.")
argparser.add_argument("-f", "--field", help="Search field. If used, the entire search term will be limited to the specified Entrez field.")
argparser.add_argument("-i", "--idtype", help="Specifies the type of identifier to return for sequence databases (nuccore, popset, protein).")
argparser.add_argument("-dt", "--datetype", help="Type of date used to limit a search.")
argparser.add_argument("-rd", "--reldate", help="When reldate is set to an integer n, the search returns only those items that have a date specified by datetype within the last n days.")
argparser.add_argument("-md", "--mindate", help="Minimum date used to limit a search result by the date specified by datetype. Must be used together with maxdate to specify an arbitrary date range.")
argparser.add_argument("-Md", "--maxdate", help="Maximum date used to limit a search result by the date specified by datetype. Must be used together with mindate to specify an arbitrary date range.")
search_args = argparser.parse_args()
return search_args
def search_query(search_args):
"""
Performs a search query in the NCBI databases. Receives the database name and the search term, along with any additional parameters.
Uses the 'history' feature.
"""
search_params = {
'db': search_args.database,
'term': search_args.term,
'retstart': search_args.retstart,
'retmax': search_args.retmax,
'rettype': search_args.rettype,
'retmode': search_args.retmode,
'sort': search_args.sort,
'field': search_args.field,
'idtype': search_args.idtype,
'datetype': search_args.datetype,
'reldate': search_args.reldate,
'mindate': search_args.mindate,
'maxdate': search_args.maxdate,
'query_key': search_args.querykey,
'WebEnv': search_args.webenv,
'usehistory': 'y'
}
server_request = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi/', params=search_params)
return server_request
def seq_fetcher(query_results, og_search_args):
"""
Uses the search_query() function to get the desired sequences, then it uses the 'history' feature to parse all the desired sequences and
returns them in FASTA format.
Outputs to STDOUT.
"""
query_xml_elemtree = ET.fromstring(query_results.text)
query_key = query_xml_elemtree.find("QueryKey").text
webenv = query_xml_elemtree.find("WebEnv").text
fetch_params = {
'db': og_search_args.database,
'rettype': 'fasta',
'query_key': query_key,
'WebEnv': webenv
}
server_request = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi', params=fetch_params)
print(server_request.text)
if __name__ == "__main__":
user_args = search_args(parser)
server_info = search_query(user_args)
seq_fetcher(server_info, user_args)