-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfeature_calculator.py
228 lines (171 loc) · 5.6 KB
/
feature_calculator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
'''
calculates features for a given file
usage: python feature_calculator.py <file_with_clean_data> [output_file]
'''
# for python 2 backward compatibility
from __future__ import division
import sys
import os
import pandas as pd
import re
import code
def usage_str():
'''
usage string
'''
print("usage: python feature_calculator.py <file_with_clean_data> [output_file]")
def decorator(n):
'''
'''
print("*"*n)
class FeatureCalculator:
def __init__(self, source_file):
self.source_file = source_file
self.data = pd.read_csv(source_file, skip_blank_lines=True)
self.set_sequence_column()
self.data.index.name = 'seq_no'
self.non_feature_columns = self.data.columns.tolist()
def set_sequence_column(self, seq="seq"):
self.SEQ = seq
def feature_columns(self):
return list(set(self.data.columns.tolist()) - set(self.non_feature_columns))
def save_features(self, columns=[], output_file=""):
'''
exports all data with features to a new file
'''
# drop useless columns
drop_cols = []
for col in self.feature_columns():
if len(self.data[col].unique()) == 1:
drop_cols.append(col)
self.data.drop(columns=drop_cols, inplace=True)
if not columns:
columns = self.data.columns.tolist()
columns = list(set(columns) - set(drop_cols))
if output_file == "":
output_file = self.source_file + ".feature"
self.data.to_csv(output_file, sep='\t', columns=columns)
print("saving features: "+str(columns))
print("Saved to " + output_file)
def gc_content(self):
'''
feature: adds gc_content attribute
'''
print ("calculating GC content")
def calc(seq):
g = seq.count('G')
c = seq.count('C')
return ( (g+c) / len(seq) ) * 100
self.data['gc_content'] = self.data[self.SEQ].apply(calc)
return self.data
def tataaa_box_present(self):
'''
feature: adds tataaa box attribute
'''
print ("calculating TATAAA box")
def calc(seq):
return int('TATAA' in seq)
self.data['tataaa'] = self.data[self.SEQ].apply(calc)
return self.data
def gc_box(self):
'''
feature: adds gc box attribute: CCAAT and GGGCGG
'''
print ("calculating GC box")
def calc(seq):
gc = ['CCAAT', 'GGGCGG']
return int(any(s in gc for s in seq))
self.data['gc_box'] = self.data[self.SEQ].apply(calc)
return self.data
def poly_a_tail(self, n=3):
'''
3 or more As in last 36 nucleotide
'''
print ("calculating poly A tail")
def calc(seq):
if len(seq) < 36:
return 0
seq = seq[-36:]
pattern = 'A{'+str(n)+'"'
matches = re.findall(pattern, seq)
return len(matches)
self.data['poly_a'] = self.data[self.SEQ].apply(calc)
return self.data
def stop_codon_present(self):
'''
Checks for stop codons TAA TGA TAG
'''
print ("calculating stop codons")
def calc(seq):
STOP = ["TAA", "TGA", "TAG"]
# check in any of the stop codons are in the seq
return int(any(sc in STOP for sc in seq))
self.data['stop_codon_present'] = self.data[self.SEQ].apply(calc)
return self.data
def sequence_length(self):
'''
calculates length of the seq
'''
print("calcualting sequence length")
def calc(seq):
return len(seq)
self.data['seq_length'] = self.data[self.SEQ].apply(calc)
return self.data
def start_codon(self):
'''
Checks for start codons TAC
'''
print("checking for start codon")
def calc(seq):
return int("TAC" in seq)
self.data['start_codon'] = self.data[self.SEQ].apply(calc)
return self.data
def kozak(self):
'''
Checks for kozak consensus sequence ACCAUGG
'''
print("checking for kozak consensus sequence")
def calc(seq):
return int("ACCAUGG" in seq)
self.data['kozak'] = self.data[self.SEQ].apply(calc)
return self.data
def feature_template(self):
'''
feature: adds new attribute
'''
print("calculating ... ? ")
def calc(seq):
# CHANGE THIS
# val = calculate feature value on seq
val = 0
return val
self.data['CHANGE_THIS_to_your_feature_name'] = self.data[self.SEQ].apply(calc)
return self.data
## starts here
decorator(25)
if(len(sys.argv) < 2):
usage_str()
else:
feature_calculator = FeatureCalculator(sys.argv[1])
# # interactive run
# decorator(25)
# print("`feature_calculator` object created, you can now interact with it")
# decorator(25)
# code.interact(local=dict(globals(), **locals()))
# calculate features
feature_calculator.gc_content()
feature_calculator.tataaa_box_present()
feature_calculator.gc_box()
feature_calculator.poly_a_tail()
feature_calculator.stop_codon_present()
feature_calculator.start_codon()
feature_calculator.sequence_length()
feature_calculator.kozak()
# save features to file
if(len(sys.argv)==3):
output_file = sys.argv[2]
else:
output_file = ""
feature_calculator.save_features(feature_calculator.feature_columns(), output_file)
print("DONE")
decorator(25)