-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtidyCSVtables.py
201 lines (156 loc) · 5.25 KB
/
tidyCSVtables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import pandas as pd
import re
import glob
from math import nan
from makeDataFunctions.makeUniqueIds import getIds
"""
Merging Data from the Database into one Dataframe
NOT GLOBBING ANYMORE AND SINCE TOOK TOO LONG
"""
# def mergePerson():
#
# #combine the Person Data using Glob
# path = 'dbCSVdata/Person' # use your path
# allFiles = glob.glob(path + "/*.csv")
# personDf = pd.DataFrame()
# dfs = []
# for file_ in allFiles:
# df = pd.read_csv(file_, index_col=None, header=0)
# dfs.append(df)
# personDf = pd.concat(dfs)
#
# personDf.to_csv('fullCSVtables/Person')
#
# def mergeCustomer():
#
# #combine the Person Data using Glob
# path = 'dbCSVdata/Customer' # use your path
# allFiles = glob.glob(path + "/*.csv")
# personDf = pd.DataFrame()
# dfs = []
# for file_ in allFiles:
# df = pd.read_csv(file_, index_col=None, header=0)
# dfs.append(df)
# personDf = pd.concat(dfs)
#
# personDf.to_csv('fullCSVtables/Customer')
#
# def mergeCustomerRep():
#
# #combine the Person Data using Glob
# path = 'dbCSVdata/CustomerRep' # use your path
# allFiles = glob.glob(path + "/*.csv")
# personDf = pd.DataFrame()
# dfs = []
# for file_ in allFiles:
# df = pd.read_csv(file_, index_col=None, header=0)
# dfs.append(df)
# personDf = pd.concat(dfs)
#
# personDf.to_csv('fullCSVtables/CustomerRep')
"""Get all types of words"""
negWords = getIds('storedData/words/negativeWords.csv', False)
posWords = getIds('storedData/words/positiveWords.csv', False)
stopWords = getIds('storedData/words/stopWords.csv', False)
customerFile= 'fullCSVtables/testData/Customer.csv'
CustRepFile= 'fullCSVtables/testData/CustomerRep.csv'
personFile= 'fullCSVtables/testData/Person.csv'
textFile= 'fullCSVtables/testData/Text.csv'
conversationFile= 'fullCSVtables/testData/Conversation.csv'
def tidyTextTable():
textDf= pd.read_csv(textFile)
#add a column for each positive and negative word
allWords= negWords+posWords
for word in allWords:
textDf[word]=0
#add count cols to the Df
for wordCol in allWords:
textDf[wordCol]= 0
#count the words
textDf= textDf.apply(countWords, axis=1)
textDf=textDf.drop('TEXT', axis=1)
#melt the word count columns
# melt platform count for Person
id_vars = ['idText', 'idConversation', 'personId']
textDf = pd.melt(textDf, id_vars=id_vars, value_vars=allWords,
var_name='KEYWORD', value_name='WORD_COUNT')
textDf.info()
# save tidy CSV and return
textDf.to_csv('tidyTables/Text.csv', index=False)
return textDf
def countWords(textDf):
allWords = negWords + posWords
#get the text for this row
textData= textDf.TEXT
textData=textData[1:].split(' ')
#count the no stop words
i=0
wordCountDict = {}
for word in allWords:
wordCountDict[word]=0
i+=1
for word in textData:
if word in allWords:
wordCountDict[word]= wordCountDict[word]+1
#set the count for the new df
for word in allWords:
count= wordCountDict[word]
textDf[word]= count
return textDf
def tidyConvoTable():
convoDf = pd.read_csv(conversationFile)
#make one bool Col for is_call instead of text too
convoDf = convoDf.drop(labels='IS_TEXT_CHAT', axis=1)
#bool-> binary encoding
convoDf['IS_CALL']= convoDf['IS_CALL'].astype(int)
#save tidy CSV and return
convoDf.to_csv('tidyTables/Conversation.csv', index=False)
return convoDf
def tidyPersonTable():
personDf= pd.read_csv(personFile)
# normalize phone number
personDf['AREA_CODE'] = personDf['PHONE NUMBER'].apply(getAreaCode)
personDf['PHONE NUMBER'] = personDf['PHONE NUMBER'].apply(normalizePhone)
#melt platform count for Person
id_vars= ['idPerson' ,'FIRST_NAME' ,'MIDDLE INITIAL', 'LAST_NAME','UTC_TIMEZONE','EMAIL', 'AREA_CODE', 'PHONE NUMBER']
personDf=personDf.rename( columns={'PLATFORM_A_CONVO_COUNT':'A', 'PLATFORM_B_CONVO_COUNT':'B', 'PLATFORM_C_CONVO_COUNT':'C'})
personDf= pd.melt(personDf, id_vars=id_vars, value_vars=['A','B','C'],
var_name='PLATFORM', value_name='CONVO_COUNT')
#gsave tidy table and return
personDf.to_csv('tidyTables/Person.csv', index=False)
return personDf
#get rid of non digits in phone num
def normalizePhone(phoneNum):
normalizeNum= re.sub('[^0-9]', '', phoneNum)
length= len(normalizeNum)
#return NaN if incomplete phone number
if length<10:
return nan
#find are code if has one
elif length>10:
endIndx= length- 10
phoneNum= normalizeNum[endIndx:]
#get the normalized phoen number
else:
phoneNum=normalizeNum
return phoneNum
#get the area code in the Phone Collumn
def getAreaCode(phoneNum):
normalizeNum= re.sub('[^0-9]', '', phoneNum)
length= len(normalizeNum)
#return NaN if incomplete phone number
if length<=10:
return nan
#find are code if has one
else:
if length==12:
endIndx= 2
else:
endIndx=1
areacode= normalizeNum[0:endIndx]
return areacode
def tidyAllTable():
tidyPersonTable()
tidyConvoTable()
tidyTextTable()
#tidyAllTable()