-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
133 lines (78 loc) · 3.5 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
def normalize (x, type = 'min_max_normalization') :
    '''
    Rescales the given vector using the requested normalization scheme.

    Two schemes are supported:
        'standardization'       : (x - mean(x)) / std(x)
        'min_max_normalization' : (x - min(x)) / (max(x) - min(x))
    Any value of type other than 'standardization' falls back to min-max
    normalization (this mirrors the historical behaviour of the function).

    A constant vector has zero spread, which would make the denominator zero and
    fill the result with NaN / inf. In that case the scale is treated as 1, so
    the result is a vector of zeros.

    Arguments :
    x    : the vector to be normalized
    type : the type of normalization to be performed (default = 'min_max_normalization').
           The name shadows the builtin but is kept so keyword callers keep working.
    Returns :
    normalized vector x
    '''
    if type == 'standardization' :
        offset = np.mean(x)
        scale = np.std(x)
    else :
        offset = np.min(x)
        scale = np.max(x) - offset

    # Only guard the scalar case; a non-scalar scale (e.g. a per-column reduction
    # of a 2-D container) is left untouched so its behaviour does not change.
    if np.ndim(scale) == 0 and scale == 0 :
        scale = 1.0

    return (x - offset) / scale
def train_test_val_split (x1, x2, Y) :
    '''
    Splits the data into train, test and cross-validation parts in the ratio 0.8 : 0.1 : 0.1.

    The two input vectors are paired column-wise so that every sample keeps its
    (x1, x2, Y) triple together through the shuffling. The split is done in two
    stages: 20% of the samples are held out first (random_state = 0), and that
    held-out part is then halved into test and validation (random_state = 3).

    Arguments
    -----------
    x1 : vector representing latitudes in data
    x2 : vector representing longitudes in data
    Y  : vector representing the target variable altitude in data

    Returns
    ---------
    x1_Train, x1_Test, x1_Val,
    x2_Train, x2_Test, x2_Val,
    Y_Train,  Y_Test,  Y_Val,
    each reshaped to a single column, in the sizes (80%, 10%, 10%) for
    (Train, Test, Val) respectively.
    '''
    def as_column (v) :
        # Force an (n, 1) layout; n is taken from the first axis of v.
        return np.reshape(v, (v.shape[0], 1))

    features = np.concatenate((as_column(x1), as_column(x2)), axis = 1)
    target = as_column(Y)

    # Stage 1: keep 80% for training, hold out the remaining 20%.
    feat_train, feat_rest, Y_Train, target_rest = train_test_split (
        features, target, test_size = 0.2, random_state = 0)
    # Stage 2: halve the held-out part into test and validation.
    feat_test, feat_val, Y_Test, Y_Val = train_test_split (
        feat_rest, target_rest, test_size = 0.5, random_state = 3)

    parts = (feat_train, feat_test, feat_val)
    x1_Train, x1_Test, x1_Val = (as_column(part [:,0]) for part in parts)
    x2_Train, x2_Test, x2_Val = (as_column(part [:,1]) for part in parts)

    return (x1_Train, x1_Test, x1_Val,
            x2_Train, x2_Test, x2_Val,
            Y_Train, Y_Test, Y_Val)
def generate_feature_matrix (x1, x2, degree) :
    '''
    Returns the feature matrix for any polynomial degree, when the number of variables is two.

    For an 'n-degree' polynomial of two variables (x1 and x2), the columns of the
    feature matrix are, in order:
        1, x1, x2, x1^2, x1*x2, x2^2, ..., x1^n, x1^(n-1)*x2, ..., x1*x2^(n-1), x2^n
    so the matrix looks like:
        [[1  x1(1)  x2(1)  x1(1)^2  x1(1) * x2(1)  ...  x2(1)^n],
         [1  x1(2)  x2(2)  x1(2)^2  x1(2) * x2(2)  ...  x2(2)^n],
         ...
         [1  x1(N)  x2(N)  x1(N)^2  x1(N) * x2(N)  ...  x2(N)^n]]
    The total number of features for a polynomial of degree D in two variables is
        (D+1) * (D+2) / 2
    A degree below 1 yields only the leading column of ones.

    Arguments
    -----------
    x1     : vector representing latitudes in data, either 1-D of length N or a column of shape (N, 1)
    x2     : vector representing longitudes in data, same length as x1
    degree : the desired degree of polynomial that we wish to fit to the data

    Returns
    ---------
    featureMatrix : array of shape (N, (degree+1)*(degree+2)/2) for the specified degree,
                    as elaborated above

    Raises
    ---------
    ValueError : if x1 or x2 is not a 1-D vector / single column, or if their lengths differ
    '''
    vectors = []
    for label, values in (('x1', x1), ('x2', x2)) :
        values = np.asarray(values)
        if values.ndim == 2 and values.shape[1] == 1 :
            # Flatten a column vector so the products below stay 1-D.
            values = values[:, 0]
        elif values.ndim != 1 :
            raise ValueError(
                label + ' must be a 1-D vector or a single column, got shape '
                + str(values.shape))
        vectors.append(values)
    x1, x2 = vectors

    if x1.shape[0] != x2.shape[0] :
        raise ValueError(
            'x1 and x2 must have the same length, got '
            + str(x1.shape[0]) + ' and ' + str(x2.shape[0]))

    # Collect the columns first and stack once at the end; growing the matrix
    # inside the loop would copy every existing column on each iteration.
    columns = [np.ones(x1.shape[0])]
    for d in range(1, degree + 1) :
        for i in range(d + 1) :
            columns.append((x1 ** (d - i)) * (x2 ** i))

    featureMatrix = np.stack(columns, axis = 1)
    return featureMatrix