-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil.py
78 lines (62 loc) · 3.24 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
""" Contains utility classes.
"""
import numpy as np
__author__ = "Hunter Chasens"
__license__ = "GPLv3"
__version__ = "0.1"
__email__ = "[email protected]"
class Utils:
    """Namespace of stateless helper functions; never instantiated."""

    @staticmethod
    def getPC(normdata):
        """Return the principal components of a dataset, strongest first.

        Args:
            normdata (ndarray): samples in rows, features in columns
                (passed to ``np.cov`` with ``rowvar=False``).

        Returns:
            (ndarray, ndarray): principal components as columns, ordered by
            descending eigenvalue, and the eigenvalues in that same order.
        """
        # np.cov collapses to a 0-d array for a single feature; keep it 2-D
        # so the eigen-decomposition below always receives a square matrix.
        covmrx = np.atleast_2d(np.cov(normdata, rowvar=False))
        # A covariance matrix is symmetric, so use eigh: it guarantees real
        # eigenvalues/eigenvectors, whereas eig may return complex values
        # caused purely by floating-point round-off.
        eigvals, pc = np.linalg.eigh(covmrx)
        # Reorder explicitly from greatest to least eigenvalue.
        order = np.argsort(eigvals)[::-1]
        return pc[:, order], eigvals[order]

    @staticmethod
    def z_score(data, removeOutliers=True):
        """Normalize each feature (column) of ``data`` to zero mean, unit std.

        Columns whose standard deviation is zero (every value identical) are
        dropped before normalizing, because they would cause a division by
        zero. The returned mean and std describe only the surviving columns.

        Args:
            data (ndarray): 2-D array, samples in rows and features in columns.
            removeOutliers (bool): when true, drop every row that has at least
                one feature three or more standard deviations from its mean.

        Returns:
            (ndarray, ndarray, ndarray): the z-scored data, the per-column
            mean and the per-column standard deviation of the kept columns.
        """
        mean = np.mean(data, axis=0)
        std = np.std(data, axis=0)

        # Build the column mask once so that mean, data and std are all
        # filtered with the same, original-shape boolean index.
        varying = std != 0
        mean = mean[varying]
        data = data[:, varying]
        std = std[varying]

        zscore = (data - mean) / std

        if removeOutliers:
            # z-scores are already expressed in units of standard deviation,
            # so the threshold is a plain 3, applied to the absolute value so
            # that low outliers are caught too. A row is kept only if all of
            # its features are within the threshold.
            zscore = zscore[np.all(np.abs(zscore) < 3, axis=1)]
        return zscore, mean, std

    @staticmethod
    def parse(filename):
        """Load a data file into an ndarray.

        Files ending in ``.npy`` are read with ``np.load``; anything else is
        read as comma-separated text with ``np.genfromtxt``. Headers are not
        supported.

        Args:
            filename (str): path to the data file, including its extension.

        Returns:
            ndarray: the parsed data.
        """
        # Check the real extension rather than a substring, so a path such as
        # "runs.npy_old/data.csv" is not mistaken for a NumPy binary file.
        if str(filename).lower().endswith(".npy"):
            print("reading npy")
            arr = np.load(filename)
        else:
            print("reading csv")
            arr = np.genfromtxt(filename, delimiter=',')
            # genfromtxt yields NaN for fields it cannot parse; drop any row
            # containing one.
            # NOTE(review): the source's indentation was lost, so it is
            # unclear whether this filter was meant for .npy input as well -
            # confirm against the original file.
            arr = arr[~np.isnan(arr).any(axis=1)]
        return arr