-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclinvar_searcher.py
executable file
·91 lines (64 loc) · 2.37 KB
/
clinvar_searcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python
"""
ClinVar Searcher

Author: Xin Wu
GrandOmics Copyright 2019

Usage:
    clinvar_searcher.py [-m max] GENE

ClinVar Searcher is a script for listing all variants given a gene.

Arguments:
    GENE          Gene name, e.g. SLC26A4

Options:
    -m --max=max  Max count of results for a query [default: 500]
    -h --help     Print help
    -v --version  Version
"""
from docopt import docopt
from Bio import Entrez
import time
import logging
import collections
import json
# Immutable record for one ClinVar variant as reported by the esummary lookup:
# identity (accession, name, vtype), clinical significance, and the genomic
# location (assembly, band, start, stop).
Variant = collections.namedtuple(
    'Variant',
    (
        'accession',
        'name',
        'vtype',
        'significance',
        'assembly',
        'band',
        'start',
        'stop',
    ),
)
def main(gene: str, retmax: int = 500):
    """Search ClinVar for the variants of ``gene`` and log each one.

    Args:
        gene: Gene symbol, e.g. ``SLC26A4``. It is queried as
            ``<gene>[Gene]``.
        retmax: Upper bound on the number of ClinVar ids requested from
            esearch. Defaults to 500, the value that was previously
            hard-coded.
    """
    # Resolve the logger by name instead of relying on a module-level
    # ``logger`` global, which only exists when the file is run as a script;
    # ``getLogger`` returns the same 'clinvar' logger either way.
    log = logging.getLogger('clinvar')
    log.info('[Entrez] search ClinVar for {id}'.format(id=gene))

    handle = Entrez.esearch(db='clinvar', term=gene + '[Gene]', retmax=retmax)
    try:
        result = Entrez.read(handle)
    finally:
        # Release the HTTP handle even if parsing the response fails.
        handle.close()

    log.info('[Count] {count}'.format(count=result['Count']))

    # One esummary round trip per id; getVariant does its own throttling.
    for variant_id in result['IdList']:
        log.info(getVariant(variant_id))
def _select_location(vlocs):
    """Pick the location entry to report from ``vlocs``; ``None`` if empty."""
    if not vlocs:
        return None
    # Prefer the entry flagged as current. Fall back to the first entry so a
    # lone entry is always used and a list without a 'current' flag no longer
    # escapes as a bare StopIteration.
    return next(
        (loc for loc in vlocs if loc.get('status') == 'current'),
        vlocs[0],
    )


def getVariant(id: str, delay: float = 2.0):
    """Fetch the ClinVar esummary record for ``id`` and return a ``Variant``.

    Args:
        id: ClinVar record id as returned by esearch. It is converted with
            ``str()`` because the JSON result is keyed by the id string.
        delay: Seconds to sleep before the request, to throttle calls to
            NCBI Entrez. Defaults to 2, the value that was previously
            hard-coded.

    Returns:
        A ``Variant`` built from the first entry of ``variation_set``. The
        location fields (assembly, band, start, stop) are ``None`` when the
        record carries no location entries.

    Raises:
        KeyError: If the response lacks one of the expected fields.
    """
    # Resolve the logger by name instead of relying on a module-level
    # ``logger`` global, which only exists when the file is run as a script.
    log = logging.getLogger('clinvar')
    uid = str(id)

    # Pause before every request for throttling of NCBI Entrez.
    time.sleep(delay)
    log.info('[Entrez] get ClinVar variant for {id}'.format(id=uid))

    handle = Entrez.esummary(db='clinvar', id=uid, retmode='json')
    try:
        summary = json.load(handle)
    finally:
        # Release the HTTP handle even if the payload is not valid JSON.
        handle.close()

    record = summary['result'][uid]

    # Only the locations of the first variation in the set are considered.
    variation_set = record['variation_set']
    vlocs = variation_set[0]['variation_loc'] if variation_set else []
    location = _select_location(vlocs)
    if location is None:
        assembly = band = start = stop = None
    else:
        assembly = location['assembly_name']
        band = location['band']
        start = location['start']
        stop = location['stop']

    return Variant(
        accession=record['accession'],
        name=record['title'],
        vtype=record['obj_type'],
        significance=record['clinical_significance']['description'],
        assembly=assembly,
        band=band,
        start=start,
        stop=stop,
    )
if __name__ == '__main__':
    # Parse the command line against the usage text in the module docstring.
    cli_args = docopt(__doc__, version='clinvar_searcher version 0.1')

    # Configure INFO-level logging and bind the 'clinvar' logger at module
    # level before anything is logged.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('clinvar')
    logger.info(cli_args)

    # NOTE(review): this address looks like a redacted placeholder; confirm
    # that a real contact address is set before querying NCBI.
    Entrez.email = '[email protected]'

    # NOTE(review): only GENE is forwarded; the parsed --max option is not
    # passed on to main().
    main(cli_args['GENE'])