Skip to content

Commit

Permalink
clearing inconsistencies and creating efficient data pipelines
Browse files Browse the repository at this point in the history
  • Loading branch information
abtExp committed Mar 19, 2020
1 parent 2a7921a commit c52c958
Show file tree
Hide file tree
Showing 11 changed files with 2,663 additions and 46 deletions.
6 changes: 3 additions & 3 deletions assign/models/layers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import keras.backend as K
from keras import initializers
import tensorflow.keras.backend as K
from tensorflow.keras import initializers

from keras.layers import Layer
from tensorflow.keras.layers import Layer
import numpy as np

class Attention(Layer):
Expand Down
26 changes: 19 additions & 7 deletions assign/models/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
from ..utils.data_loader import DATA_LOADER
from ..utils.data_feeder import data_loader

from keras.layers import Input, Dense, Dropout, CuDNNLSTM, Bidirectional,\
from tensorflow.keras.layers import Input, Dense, Dropout, CuDNNLSTM, Bidirectional,\
Concatenate, Reshape, GlobalAveragePooling2D, ConvLSTM2D, GlobalAveragePooling1D
from keras.models import Model
from keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

import tensorflow as tf
import keras
Expand All @@ -23,8 +23,8 @@ class MODEL(BASE):
def __init__(self, vars, model='assign'):
self.model_name = model
self.inp_shape = (vars.MAX_SENTENCES,)
self.speech_encoder = SPEECH_ENCODER(vars).model
self.text_encoder = TEXT_ENCODER(vars).model
# self.speech_encoder = SPEECH_ENCODER(vars).model
# self.text_encoder = TEXT_ENCODER(vars).model
self.load_mode = 'both'

super(MODEL, self).__init__(vars)
Expand Down Expand Up @@ -73,11 +73,23 @@ def compose_model(self):
layer = Bidirectional(CuDNNLSTM(128, return_sequences=True))(features)
layer = GlobalAveragePooling1D()(layer)
layer = Dense(256, activation='relu')(layer)
layer = Dropout(0.3)(layer)
# layer = Dropout(0.3)(layer)
layer = Dense(self.vars.NUM_DAYS_PRED)(layer)

model = Model(inputs=[text, speech], outputs=layer)

model.compile(loss='mean_squared_error', optimizer=Adam())

return model
return model

def tf_model(self):
    """Convert the Keras model into a tf.estimator.Estimator, in place.

    Replaces ``self.model`` with the estimator; checkpoints and graph
    artifacts are written to ``self.vars.CHECK_PATH``.
    NOTE(review): assumes ``self.model`` is already a *compiled* Keras
    model (compose_model calls ``model.compile``) — model_to_estimator
    requires a compiled model; confirm this is always invoked after
    compose_model.
    """
    self.model = tf.keras.estimator.model_to_estimator(keras_model=self.model, model_dir=self.vars.CHECK_PATH)

def tf_init_loaders(self):
    """Placeholder for building estimator input pipelines; not implemented yet (returns None)."""
    return

def tf_train_and_eval(self):
    """Placeholder for the estimator train/evaluate loop; not implemented yet (returns None)."""
    return

def tf_predict(self, x):
    """Placeholder for estimator-based prediction on input ``x``; not implemented yet (returns None)."""
    return
4 changes: 0 additions & 4 deletions assign/models/text_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,6 @@ def compose_model(self):
layer = Bidirectional(CuDNNLSTM(units=512, return_sequences=True))(inp)
layer = Bidirectional(CuDNNLSTM(units=256, return_sequences=True))(layer)
layer = GlobalAveragePooling1D()(layer)
# layer = EXPAND_DIM()(layer)
# layer = Bidirectional(CuDNNLSTM(units=256, return_sequences=True))(layer)
# layer = GlobalAveragePooling1D()(layer)
# layer = Attention(self.vars.MAX_SENTENCE_LENGTH)(layer)
layer = Dense(256, activation='relu')(layer)
layer = Dropout(0.3)(layer)
layer = Dense(self.vars.NUM_DAYS_PRED)(layer)
Expand Down
31 changes: 31 additions & 0 deletions assign/scripts/find_data_inconsistencies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from ..settings import vars
from ..utils.data_feeder import data_loader
from ..models import SENTENCE_ENCODER, SPEAKER_ENCODER

from transformers import BertTokenizer

from os import listdir

import tensorflow as tf

def find_inconsistencies():
    """Scan every data folder and record the ones that fail to load.

    Runs the shared ``data_loader`` in test mode over each
    ``<split>/<company_date>`` folder under ``vars.DATA_PATH + 'data/'``,
    collects the error strings it reports, and writes them all to
    ``inconsistencies.txt`` in the working directory.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    text_encoder = SENTENCE_ENCODER(vars)
    audio_encoder = SPEAKER_ENCODER(vars, graph=tf.get_default_graph())

    # Accumulate one line per failing folder, then join once at the end.
    error_lines = []

    data_root = vars.DATA_PATH + 'data/'
    for split in listdir(data_root):
        for sample in listdir(data_root + split):
            _, _, err = data_loader(
                vars, mode='test',
                folder='/' + split + '/' + sample,
                encoder=audio_encoder,
                tokenizer=tokenizer,
                model=text_encoder, load_mode='both')

            if err:
                error_lines.append('{} : {}\n'.format(split + '/' + sample, err))

    with open('inconsistencies.txt', 'w') as f:
        f.write(''.join(error_lines))
40 changes: 25 additions & 15 deletions assign/utils/data_feeder.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,28 +30,33 @@ def data_loader(vars, mode='train', folder='', encoder=None, tokenizer=None, mod
data_folder = vars.DATA_PATH+'data'
batch_size = 0

if mode == 'train':
data_folder = data_folder+'/train/'
batch_size = vars.TRAIN_BATCH_SIZE
elif mode == 'valid':
data_folder = data_folder+'/valid/'
batch_size = vars.VALID_BATCH_SIZE
if len(folder) == 0:
if mode == 'train':
data_folder = data_folder+'/train/'
batch_size = vars.TRAIN_BATCH_SIZE
elif mode == 'valid':
data_folder = data_folder+'/valid/'
batch_size = vars.VALID_BATCH_SIZE
else:
data_folder = data_folder+'/test/'
batch_size = 1
else:
data_folder = data_folder+'/test/'
batch_size = 1

txts = []
auds = []
prices = []
err = ''

all_datas = listdir(data_folder)

while len(prices) < batch_size:
if len(folder) == 0:
idx = np.random.choice(np.arange(0, len(all_datas)), batch_size, replace=False)[0]
folder = all_datas[idx]

company, start_date = folder.split('_')
company, start_date = folder.split('_')
else:
company, start_date = folder[folder.rindex('/')+1:].split('_')

if load_mode == 'audio' or load_mode == 'both':
aud_features = []
Expand All @@ -73,6 +78,7 @@ def data_loader(vars, mode='train', folder='', encoder=None, tokenizer=None, mod

except Exception as e:
print('Can\'t read! : ', e)
err = 'Text Data Read Error : {}'.format(data_folder+folder)
features = []

features = np.array(features)
Expand All @@ -91,14 +97,16 @@ def data_loader(vars, mode='train', folder='', encoder=None, tokenizer=None, mod
cntr += 1
if cntr >= vars.MAX_SENTENCES:
break
if cntr == 0:
err = 'No Audio Found: {}'.format(data_folder+folder)
aud_features = np.array(aud_features)

if len(aud_features) > 0:
if len(aud_features) < vars.MAX_SENTENCES:
aud_features = np.concatenate((aud_features, np.zeros((vars.MAX_SENTENCES-len(aud_features), *np.shape(aud_features[0])))))


labels = load_target(vars, company, start_date)
labels, error = load_target(vars, company, start_date)

if type(labels) == np.ndarray:
is_avail = False
Expand Down Expand Up @@ -127,14 +135,16 @@ def data_loader(vars, mode='train', folder='', encoder=None, tokenizer=None, mod
prices.append(labels)
else:
if mode == 'test':
return np.array([]), []
err = 'Can\'t read target : {}, {}'.format(data_folder+folder, err)
return np.array([]), [], err
else:
err = 'Can\'t read target : {}'.format(data_folder+folder, error)
if mode == 'test':
return np.array([]), []
return np.array([]), [], err

if load_mode == 'audio':
return np.expand_dims(auds, axis=-1), np.array(prices)
return np.expand_dims(auds, axis=-1), np.array(prices), err
elif load_mode == 'text':
return np.array(txts), np.array(prices)
return np.array(txts), np.array(prices), err
else:
return [np.array(txts), np.expand_dims(auds, axis=-1)], np.array(prices)
return [np.array(txts), np.expand_dims(auds, axis=-1)], np.array(prices), err
2 changes: 1 addition & 1 deletion assign/utils/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def load_target(vars, company, start_date):
all_data = pd.read_csv(target_path)
except Exception as e:
print('Error reading target : ', e)
return target
return target, e

all_data['formatted_date'] = pd.to_datetime(all_data['Date'])
all_data = all_data.sort_values(by='formatted_date', ascending=True)
Expand Down
Loading

0 comments on commit c52c958

Please sign in to comment.