# logistic_regression_l1.py
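# L1-penalized logistic regression over a grid of C values, run as a nipype
# workflow: load X and y, fit with 10-fold stratified cross-validation at each
# C, and write the per-fold results to datadir/l1_out.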
from nipype.interfaces.utility import Function
import nipype.pipeline.engine as pe
import nipype.interfaces.utility as niu
import numpy as np

if __name__ == '__main__':
    import argparse
    defstr = '(default %(default)s)'
    parser = argparse.ArgumentParser()
    parser.add_argument('-dd', '--datadir', type=str,
                        help='directory containing the data; X and y must be in the same directory')
    parser.add_argument('-x', '--xname', type=str, help='filename of the predictor matrix X')
    parser.add_argument('-y', '--yname', type=str, help='filename of the response vector y')
    args = parser.parse_args()
    datadir = args.datadir
    xname = args.xname
    yname = args.yname
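
    # Example invocation (hypothetical paths and filenames; adjust to your data):
    #   python logistic_regression_l1.py --datadir /path/to/data --xname X.csv --yname y.txt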

    # Import statements injected into the isolated namespace of each nipype
    # Function node (one set for data loading, one for model fitting).
    imports_data = ['import os',
                    'import pandas as pd',
                    'import numpy as np']
    imports_model = ['import os',
                     'import numpy as np',
                     'from sklearn import linear_model',
                     'from sklearn.model_selection import StratifiedKFold',
                     'from sklearn.metrics import roc_curve, auc, roc_auc_score']

    def data(datadir, xname, yname):
        '''Load the predictor matrix X and response vector y from datadir.'''
        X = pd.read_csv(os.path.join(datadir, xname))
        y = np.genfromtxt(os.path.join(datadir, yname))
        X = X.values
        # Recode the response labels from {1, 2} to {0, 1}.
        y[y == 1.] = 0
        y[y == 2.] = 1
        return X, y

    Data = pe.Node(name='Data',
                   interface=Function(
                       input_names=['datadir', 'xname', 'yname'],
                       output_names=['X', 'y'],
                       function=data,
                       imports=imports_data))
    Data.inputs.datadir = datadir
    Data.inputs.xname = xname
    Data.inputs.yname = yname
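
    # Inferred from the readers above: xname should be a CSV with a header row
    # (read via pandas), and yname a plain-text vector of labels coded 1/2
    # (read via numpy.genfromtxt).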

    def check_assume(X):
        '''Raise if the columns of X are not standardized (mean 0, SD 1).'''
        if not np.allclose(X.mean(0), np.zeros(X.shape[1])):
            raise Exception("Data are not demeaned")
        if not np.allclose(X.std(0), np.ones(X.shape[1])):
            raise Exception("Data do not have standard deviation of 1")

    Check_Assume = pe.Node(name='Check_Assume',
                           interface=Function(
                               input_names=['X'],
                               output_names=[],
                               function=check_assume,
                               imports=imports_data))
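
    # The check above assumes X was standardized upstream; a minimal sketch of
    # that preprocessing (not part of this workflow) would be:
    #   X = (X - X.mean(0)) / X.std(0)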

    def logistic_l1(X, y, C):
        '''L1-penalized logistic regression evaluated with 10-fold stratified CV.'''
        classifier_params = {'C': [], 'coef_': [], 'intercept_': [],
                             'score': [], 'k': [], 'roc_auc_score': [], 'n_iter_': []}
        stratkfold = StratifiedKFold(n_splits=10, shuffle=False)
        model = linear_model.LogisticRegression(C=C, penalty='l1',
                                                solver='liblinear', max_iter=1000)
        for k, (train, test) in enumerate(stratkfold.split(X, y)):
            model.fit(X[train], y[train])
            classifier_params['C'].append(C)
            classifier_params['score'].append(model.score(X[test], y[test]))
            classifier_params['coef_'].append(model.coef_)
            classifier_params['k'].append(k)
            classifier_params['intercept_'].append(model.intercept_)
            classifier_params['n_iter_'].append(model.n_iter_)
            classifier_params['roc_auc_score'].append(
                roc_auc_score(y[test], model.predict(X[test])))
        return classifier_params

    LogisticL1 = pe.Node(name='LogisticL1',
                         interface=Function(
                             input_names=['X', 'y', 'C'],
                             output_names=['classifier_params'],
                             function=logistic_l1,
                             imports=imports_model))

    # Fan out one LogisticL1 run per value of C; C must be strictly positive
    # for LogisticRegression, so the grid starts at the step size, not at 0.
    Iternode = pe.Node(niu.IdentityInterface(fields=['C']), name='Iternode')
    Iternode.iterables = [('C', np.arange(.0005, 1, .0005))]

    def savefunc(classifier_params, datadir):
        '''Write the per-fold results for one value of C to datadir/l1_out.'''
        os.chdir(datadir)
        out = pd.DataFrame(classifier_params)
        outdir = os.path.join(datadir, 'l1_out')
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        if os.getcwd() != outdir:
            os.chdir(outdir)
        out.to_csv('{}_logreg_model.csv'.format(classifier_params['C'][0]), index=False)

    Savefunc = pe.Node(name='Savefunc',
                       interface=Function(
                           input_names=['classifier_params', 'datadir'],
                           output_names=[],
                           function=savefunc,
                           imports=imports_data))
    Savefunc.inputs.datadir = datadir
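
    # Each value of C yields one CSV, <C>_logreg_model.csv, holding the ten
    # per-fold scores, coefficients, intercepts, iteration counts, and ROC AUC
    # values.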

    # Assemble and run the workflow: Data feeds LogisticL1, Iternode supplies
    # the C grid, and Savefunc writes the results for each C.
    wf = pe.Workflow(name='log_test')
    # wf.connect(Data, 'X', Check_Assume, 'X')
    wf.connect(Data, 'X', LogisticL1, 'X')
    wf.connect(Data, 'y', LogisticL1, 'y')
    wf.connect(Iternode, 'C', LogisticL1, 'C')
    wf.connect(LogisticL1, 'classifier_params', Savefunc, 'classifier_params')
    wf.run()