-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathExtractGenes.py
88 lines (82 loc) · 3.38 KB
/
ExtractGenes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# extract gene names
# python ExtractGenes.py -f vep.deleterious.vep.txt -o vep.genes.tsv -goa goa_human.gaf -go go.obo
import argparse
def getGoa(goaFile):
dict = {}
with open(goaFile, 'r') as input:
for row in input:
if not row.startswith('!'):
cols = row.strip('\n').split('\t')
if cols[2] in dict:
dict[cols[2]].append(cols[4])
else:
dict[cols[2]] = [cols[4]]
#print('goa', cols[2], cols[4])
return dict
def getGo(goFile):
dict = {}
with open(goFile, 'r') as input:
for row in input:
if row.startswith('[Term]'):
idName = None
else:
if row.startswith('id: GO:'):
idName = 'GO:' + row.strip('\n').split(': GO:')[1]
else:
if row.startswith('def:') and idName != None:
dict[idName] = row.strip('\n').split(':')[1].lstrip(' ').lstrip('"').rstrip('" [GOC')
#print('go', idName, dict[idName])
idName = None
return dict
def writeOutput(dictGenes, dictGenesFreq, outputFileName):
output = open(outputFileName, 'w')
for gene in dictGenes:
output.write(gene + '\t' + str(dictGenesFreq[gene]) + '\t' + dictGenes[gene] + '\n')
output.close()
def getGenesDesc(vepFileName, dictGoa, dictGo, outputFileName):
dictGenes = {}
dictGenesFreq = {}
countNotGoa = 0
countGoa = 0
countNotGo = 0
countGo = 0
with open(vepFileName, 'r') as input:
for row in input:
cols = row.strip('\n').split('\t')
extraCol = cols[13].split(';')
for eCol in extraCol:
if eCol.startswith('SYMBOL='):
gene = eCol.split('=')
if gene[1] in dictGoa:
if gene[1] in dictGenesFreq:
dictGenesFreq[gene[1]] += 1
else:
for goId in dictGoa[gene[1]]:
if goId in dictGo:
dictGenes[gene[1]] = dictGo[goId]
dictGenesFreq[gene[1]] = 1
countGo += 1
else:
countNotGo += 1
countGoa += 1
#print('gene', gene[1], dictGenes[gene[1]])
else:
countNotGoa += 1
print('count goa/not goa/go:', countGoa, countNotGoa)
print('count go/not go:', countGo, countNotGo)
print('genes found:', len(dictGenes))
return dictGenes, dictGenesFreq
def main(vepFileName, goaFile, goFile, outputFileName):
dictGoa = getGoa(goaFile)
dictGo = getGo(goFile)
print('dict goa/go:', len(dictGoa), len(dictGo))
dictGenes, dictGenesFreq = getGenesDesc(vepFileName, dictGoa, dictGo, outputFileName)
writeOutput(dictGenes, dictGenesFreq, outputFileName)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-f', help='vep file')
parser.add_argument('-goa', help='go annotation file')
parser.add_argument('-go', help='go obo file')
parser.add_argument('-o', help='output file')
args = parser.parse_args()
main(args.f, args.goa, args.go, args.o)