-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathGetExpLevelsPerGene.py
executable file
·187 lines (171 loc) · 6.36 KB
/
GetExpLevelsPerGene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#! /usr/bin/env python
## This script sums all read counts from exons per gene and computes their RPKM.
## Author: Timothee Flutre
import sys
import os
import getopt
import scipy
class Gene(object):
def __init__(self, n="", c="", s=0, e=0):
self.name = n
self.chr = c
self.start = int(s)
self.end = int(e)
self.sumExonLengths = self.end - self.start
self.aReadCounts = None
def __repr__(self):
return self.name, self.chr, self.start, self.end, self.sumExonLengths
def getRpkm(self, aAllReadCounts):
rpkms = scipy.zeros(len(aAllReadCounts))
idx = (aAllReadCounts != 0)
rpkms[idx] = 10**9 * self.aReadCounts[idx] / (aAllReadCounts[idx] \
* self.sumExonLengths)
return rpkms
def read(self, inH):
line = inH.readline()
if line == "":
return
lTokens = line.rstrip().split()
self.name = lTokens[3]
self.chr = lTokens[0]
self.start = int(lTokens[1])
self.end = int(lTokens[2])
self.sumExonLengths = self.end - self.start
self.aReadCounts = scipy.array(map(int, lTokens[4:]))
while True:
prevPos = inH.tell()
line = inH.readline()
if line == "":
break
lTokens = line.rstrip().split()
if lTokens[3] != self.name:
inH.seek(prevPos)
break
if int(lTokens[1]) < self.start:
self.start = int(lTokens[1])
if int(lTokens[2]) > self.end:
self.end = int(lTokens[2])
self.sumExonLengths += int(lTokens[2]) - int(lTokens[1])
self.aReadCounts += scipy.array(map(int, lTokens[4:]))
def write(self, outH, expLevels, aAllReadReadCounts):
txt = "%s" % self.chr
txt += " %i" % self.start
txt += " %i" % self.end
txt += " %s" % self.name
if expLevels == "rpkm":
rpkms = self.getRpkm(aAllReadReadCounts)
for i in xrange(0,len(rpkms)):
txt += " %.2f" % rpkms[i]
else:
for i in xrange(0,len(self.aReadCounts)):
txt += " %i" % self.aReadCounts[i]
outH.write("%s\n" % txt)
def help():
msg = "`%s' sums all read counts from exons per gene and computes their RPKM.\n" % os.path.basename(sys.argv[0])
msg += "RPKM = 10^9 * C / ( N * L )\n"
msg += "C=read count for transcript, N=total read count, L=sum of exon lengths in bp\n"
msg += "\n"
msg += "Usage: %s [OPTIONS] ...\n" % os.path.basename(sys.argv[0])
msg += "\n"
msg += "Options:\n"
msg += " -h, --help\tdisplay the help and exit\n"
msg += " -v, --verbose\tverbosity level (default=1)\n"
msg += " -i, --input\tinput file in BED-like format\n"
msg += "\t\tcolumns: chr exonStart exonEnd geneName readCountSample1, readCountSample2...\n"
msg += "\t\tfile should be sorted, use sort -k1,1V -k4,4V -k2,2n -k3,3n\n"
msg += "\t\tshould have a header line with the identifiers of the samples\n"
msg += " -o, --output\toutput file in a similar format\n"
msg += "\t\tcolumns: chr most5'exon most3'exon geneName RpkmSample1 RpkmSample2...\n"
msg += " -e, --exp\texpression levels as 'rpkm' (default) or 'readcount'\n"
msg += "\n"
msg += "Example:\n"
msg += " %s -i read_counts_per_exon.txt -o rpkm_per_gene.txt" % os.path.basename(sys.argv[0])
print msg; sys.stdout.flush()
def setParamsFromCmdLine():
inFile = ""
outFile = ""
expLevels = "rpkm"
verbose = 1
try:
opts, args = getopt.getopt(sys.argv[1:], "hv:i:o:e:",
["help", "verbose=", "input=",
"output=", "exp="])
except getopt.GetoptError, err:
sys.stderr.write("%s\n" % str(err))
help()
sys.exit(2)
for o, a in opts:
if o in ("-h", "--help"):
help()
sys.exit(0)
elif o in ("-v", "--verbose"):
verbose = int(a)
elif o in ("-i", "--input"):
inFile = a
elif o in ("-o", "--output"):
outFile = a
elif o in ("-e", "--exp"):
expLevels = a
if inFile == "":
msg = "ERROR: missing input file (-i)"
sys.stderr.write("%s\n\n" % msg)
help()
sys.exit(1)
if outFile == "":
msg = "ERROR: missing output file (-o)"
sys.stderr.write("%s\n\n" % msg)
help()
sys.exit(1)
if expLevels not in ("rpkm", "readcount"):
msg = "ERROR: unknown expression level (-e)"
sys.stderr.write("%s\n\n" % msg)
help()
sys.exit(1)
return inFile, outFile, expLevels, verbose
def readAllGenes(inFile, verbose):
if verbose > 0:
print "load file '%s'..." % inFile
sys.stdout.flush()
lSampleNames = []
lGenes = []
aAllReadCounts = None
inH = open(inFile)
line = inH.readline()
lTokens = line.rstrip().split()
lSampleNames = lTokens[4:]
aAllReadCounts = scipy.zeros(len(lSampleNames))
lGeneNames = []
while True:
iGene = Gene()
iGene.read(inH)
if iGene.name == "":
break
if iGene.name in lGeneNames:
msg = "ERROR: the input file is not sorted, see the help"
sys.stderr.write("%s\n" % msg)
sys.exit(1)
aAllReadCounts += iGene.aReadCounts
lGenes.append(iGene)
lGeneNames.append(iGene.name)
inH.close()
if verbose > 0:
print "nb of genes: %i" % len(lGenes)
print "nb of samples: %i" % len(aAllReadCounts)
print "nb of reads: %i" % aAllReadCounts.sum()
sys.stdout.flush()
return lSampleNames, lGenes, aAllReadCounts
def writeAllGenes(lSampleNames, lGenes, aAllReadCounts, expLevels, outFile, verbose):
outH = open(outFile, "w")
txt = "chr start end gene"
for sample in lSampleNames:
txt += " %s" % sample
outH.write("%s\n" % txt)
for iGene in lGenes:
iGene.write(outH, expLevels, aAllReadCounts)
outH.close()
def main():
inFile, outFile, expLevels, verbose = setParamsFromCmdLine()
lSampleNames, dGenes, aAllReadCounts = readAllGenes(inFile, verbose)
writeAllGenes(lSampleNames, dGenes, aAllReadCounts, expLevels, outFile, verbose)
if __name__ == "__main__":
main()