-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathspeak_cnn.py
46 lines (40 loc) · 1.51 KB
/
speak_cnn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from story_teller import *
import string
import codecs
import random
# Output file names. They deliberately reuse the names of the RCV1 files so
# that the downstream code needs as little modification as possible.
# Training split: tokenized text and its labels.
datafile = "rcv1-1m-train.txt.tok"
datalabel = "rcv1-1m-train.lvl2"
# Category dictionary listing the labels in use.
dictfile = "rcv1-lvl2.catdic"
# Test split: tokenized text and its labels.
testfile = "rcv1-1m-test.txt.tok"
testlabel = "rcv1-1m-test.lvl2"
def speakFile(datacontainer, dataTarget, labelTarget):
    """Write documents and their labels to two parallel files in random order.

    Line i of the data file and line i of the label file describe the same
    item. Both files are written as UTF-8 and overwritten if they exist.

    Args:
        datacontainer: mapping with a 'data' sequence and a 'targets'
            sequence that are indexed by the same positions. Labels are
            concatenated with a newline, so they are presumably text --
            confirm against the producer in story_teller.
        dataTarget: path of the file receiving one cleaned document per line.
        labelTarget: path of the file receiving one label per line.
    """
    # random.shuffle needs a mutable sequence; range() is only a list on
    # Python 2, so materialize it explicitly.
    order = list(range(len(datacontainer['data'])))
    random.shuffle(order)
    # The context managers close both files even if a write fails part-way.
    with codecs.open(dataTarget, "w", "utf-8") as dfile, \
            codecs.open(labelTarget, 'w', 'utf-8') as lfile:
        for idx in order:
            # Each document is flattened to a single line so that the two
            # files stay aligned line by line.
            content = removeNewlines(datacontainer['data'][idx])
            label = datacontainer['targets'][idx]
            dfile.write(content + "\n")
            lfile.write(label + "\n")
# Translation table for text strings: maps the code point of every ASCII
# punctuation character to None, which makes translate() delete it.
remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))


def removeNewlines(content):
    """Return *content* flattened to one line of punctuation-free ASCII text.

    ASCII punctuation is deleted, non-ASCII characters are dropped, and the
    text is split on newlines; each piece is stripped of surrounding
    whitespace (which also removes a trailing carriage return) and the pieces
    are joined with single spaces. An empty line therefore yields two
    adjacent spaces in the result.

    Args:
        content: text string (``unicode`` on Python 2, ``str`` on Python 3).

    Returns:
        The cleaned text string.
    """
    cleaned = content.translate(remove_punctuation_map)
    # Going through ASCII drops every character outside the ASCII range.
    ascii_bytes = cleaned.encode("ascii", "ignore")
    # Bytes literals keep the split/join working on the encoded value under
    # both Python 2 and Python 3 (a text separator raises TypeError on 3).
    pieces = [piece.strip() for piece in ascii_bytes.split(b"\n")]
    return b" ".join(pieces).decode("utf-8")
# Driver: build the training and test sets for the two labels below and write
# them, plus the category dictionary, to the file names defined above.
# The sizes passed here are 2000 (training) and 200 (test); an earlier note
# mentioned starting from 100 and 10 and increasing later. What the number
# counts exactly is defined by story_teller -- TODO confirm.
sLabels = ('0To15', '150Plus')
training_data = trainingData(2000, sLabels)
test_data = testingData(200, sLabels)
speakFile(training_data, datafile, datalabel)
speakFile(test_data, testfile, testlabel)
# One label per line; the context manager closes the file even if a write
# fails.
with codecs.open(dictfile, "w", 'utf-8') as labelFH:
    for label in sLabels:
        labelFH.write(label + "\n")