-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataManipulationFuncs.py
106 lines (66 loc) · 2.32 KB
/
dataManipulationFuncs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dataCleaning.classificationData import getWineData
from dataCleaning.regressionData import getSsinData, getEducationData, getBasicRegData
"""
Functions to perform Data Manipulation on Pandas Dataframes
Methods:
oneHotEncoding(df)
-performs one hot encoding on a Pandas df/series
-changes columns with dtype categorical
enocodeDiscreteDatWithinUnKnowns(dfFull, df, colName, numHigh, numLow=0, iterator=1)
-does a one hot encoding for discrete Data with unfilled Data
"""
def oneHotEncoding(df):
    """Return the one-hot encoded form of *df*.

    Thin wrapper around ``pd.get_dummies``; the encoded object produced by
    that call is returned directly.
    """
    return pd.get_dummies(df)
def imputeValues(df):
    """Fill missing values with the mean of their column.

    Args:
        df: pandas DataFrame or Series to impute.

    Returns:
        The result of ``df.fillna(...)``: for a DataFrame, NaNs in numeric
        columns are replaced by that column's mean and non-numeric columns
        are left as they are; for a Series, NaNs are replaced by its mean.
    """
    if isinstance(df, pd.DataFrame):
        # Without numeric_only=True, pandas >= 2.0 raises TypeError as soon
        # as the frame holds a non-numeric (e.g. string) column.
        fillValues = df.mean(numeric_only=True)
    else:
        fillValues = df.mean()
    return df.fillna(fillValues)
def concatDFHorizantaly(dfList):
    """Concatenate a list of DataFrames side by side (``axis=1``).

    Args:
        dfList: non-empty list of pandas objects. The list itself is not
            modified; an empty list raises IndexError.

    Returns:
        The first element of ``dfList`` with every following element
        concatenated onto it column-wise, one at a time, keeping the
        original index labels (``ignore_index=False``).
    """
    fullDf = dfList[0]
    print("\n testing shape concat")
    # Slice instead of pop(0) so the caller's list is left untouched.
    for df in dfList[1:]:
        # NOTE(review): only the frames after the first are passed through
        # fixMissingIndexVals -- confirm whether the first should be too.
        fixMissingIndexVals(df)
        fullDf = pd.concat([fullDf, df], axis=1, ignore_index=False)
    return fullDf
def fixMissingIndexVals(df):
    """Replace missing (NaN/None) index labels of *df* with placeholder names.

    Each missing label is renamed, in order of appearance, to
    "UNAMED COMPANY 1", "UNAMED COMPANY 2", ... The index of *df* is
    replaced in place; nothing is returned.

    Args:
        df: pandas object whose index may contain missing labels.
    """
    isMissing = pd.isnull(df.index.values)
    missingCount = int(np.sum(isMissing))
    if missingCount == 0:
        # Nothing to rename; leave the index exactly as it is.
        return

    prefix = "UNAMED COMPANY "
    # Work on an object-dtype copy so string labels can be written into an
    # index that may originally have been numeric.
    labels = df.index.to_numpy(dtype=object, copy=True)
    labels[isMissing] = [prefix + str(i + 1) for i in range(missingCount)]
    # Assigning through df.index relabels the rows in place. set_index()
    # would interpret the new names as column keys and raise.
    df.index = pd.Index(labels, name=df.index.name)
    print("####mising values filled")
def enocodeDiscreteDatWithinUnKnowns(dfFull, df, colName, numHigh, numLow=0, iterator=1):
    """One-hot encode an integer column over a fixed range of values.

    For every value ``v`` in ``np.arange(numLow, numHigh, iterator)`` a
    column named ``"<colName>_<v>"`` is added to *dfFull* holding 1 where
    ``df == v`` and 0 elsewhere. Rows whose value lies outside the range
    therefore get 0 in every new column.

    Args:
        dfFull: DataFrame that receives the new columns (added in place)
            and that contains the column *colName*.
        df: int64 Series with the values to encode; it is aligned to
            *dfFull* by index when the new columns are assigned.
        colName: name of the original column in *dfFull*; also used as the
            prefix of the new column names.
        numHigh: exclusive upper bound of the encoded values.
        numLow: inclusive lower bound of the encoded values.
        iterator: step between encoded values.

    Returns:
        A copy of *dfFull* without the column *colName*, or None (after
        printing an error) when *df* is not of dtype int64.
    """
    # make sure the values are integers
    isInt = (df.dtypes == 'int64')
    if not isInt:
        print("\nError: Convert Series Data in '" + colName + "' to int64 first")
        return None

    for val in np.arange(numLow, numHigh, iterator):
        newColName = colName + "_" + str(val)
        # Encode from the data itself, so that any numLow or step works.
        dfFull[newColName] = (df == val).astype('int64')

    # Drop the original column; pandas >= 2.0 rejects a positional axis here.
    return dfFull.drop(columns=colName)