Commit e5f77ff: new file structure
constant-inos committed Dec 27, 2020

Showing 34 changed files with 3,300 additions and 0 deletions.
102 changes: 102 additions & 0 deletions agents/ActorCritic.py
@@ -0,0 +1,102 @@
import os, sys, inspect

# make the project root importable when this file is run directly
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # suppress TensorFlow info/warning messages
from tensorflow.keras.optimizers import Adam
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from networks.networks import *


class Agent(object):
def __init__(self, n_actions=2,lr=0.01, gamma=0.99):
self.lr=lr
self.gamma=gamma
self.n_actions=n_actions
self.action_space = [i for i in range(n_actions)]

self.actor_critic = ActorCriticNetwork(n_actions=n_actions)
        self.actor_critic.compile(optimizer=Adam(learning_rate=lr))

self.action = None

def choose_action(self,state):
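        # Run the policy on a batched copy of the state and sample an action
        # from the categorical distribution given by the policy head.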
state = tf.convert_to_tensor([state])
_, probs = self.actor_critic(state)
action = np.random.choice(self.action_space,p=probs.numpy()[0])
return action

def save_model(self):
self.actor_critic.save_weights(self.actor_critic.model_name)

def load_model(self):
self.actor_critic.load_weights(self.actor_critic.model_name)

def learn(self,state,action,reward,state_,done):

        # add a batch dimension so inputs match the network's expected shape
        state = tf.convert_to_tensor([state])
        state_ = tf.convert_to_tensor([state_])
        reward = tf.convert_to_tensor([reward])

with tf.GradientTape() as tape:
value, probs = self.actor_critic(state)
value_, probs_ = self.actor_critic(state_)
value = tf.squeeze(value)
value_ = tf.squeeze(value_)

action_probs = tfp.distributions.Categorical(probs=probs)
log_prob = action_probs.log_prob(tf.convert_to_tensor(action))

"""
log_prob = -sparse_categorical_crossentropy_with_logits
what is the loss function exactly ??? calculate it
(how tf works, sess, graph, fast?)
"""

delta = reward + self.gamma * value_ * (1-int(done)) - value
actor_loss = -log_prob * delta
critic_loss = delta**2

            # Both heads share one network, so a single combined loss drives one
            # gradient step that updates the actor and the critic together.
            total_loss = actor_loss + critic_loss

gradient = tape.gradient(total_loss, self.actor_critic.trainable_variables)
self.actor_critic.optimizer.apply_gradients(zip(gradient,self.actor_critic.trainable_variables))



if __name__ == '__main__':
import gym

env = gym.make('CartPole-v0')
    agent = Agent(lr=0.9e-5, n_actions=env.action_space.n)
n_games = 2000

score_history = []
max_score, max_avg = 0,0


for i in range(n_games):
obs = env.reset()
done = False
score = 0
steps = 0
while not done:
action = agent.choose_action(obs)
obs_,reward,done,info = env.step(action)
score += reward
agent.learn(obs,action,reward,obs_,done)
obs = obs_
steps += 1
score_history.append(score)
avg_score = np.mean(score_history[-100:])

print('GAMES:',i,'SCORE:',score,'AVG SCORE:',avg_score)
        if i % 100 == 0: print('BEST SCORE:', max_score, 'BEST AVG:', max_avg)
if score > max_score: max_score = score
if avg_score > max_avg: max_avg = avg_score
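
Note: the agent above pulls ActorCriticNetwork from networks/networks.py through a star import; that module is part of this commit but not shown in this excerpt. As a rough sketch of the interface the agent relies on (a callable Keras model returning a (value, probs) pair and exposing a model_name attribute used by save_weights/load_weights), something like the following would fit; the layer sizes and weight-file name here are illustrative guesses, not the repository's actual definition.

from tensorflow import keras
from tensorflow.keras.layers import Dense

class ActorCriticNetwork(keras.Model):
    def __init__(self, n_actions, fc1_dims=1024, fc2_dims=512):
        super(ActorCriticNetwork, self).__init__()
        self.model_name = 'actor_critic.h5'  # consumed by Agent.save_model / load_model
        self.fc1 = Dense(fc1_dims, activation='relu')
        self.fc2 = Dense(fc2_dims, activation='relu')
        self.v = Dense(1, activation=None)                # state-value head V(s)
        self.pi = Dense(n_actions, activation='softmax')  # policy head pi(a|s)

    def call(self, state):
        x = self.fc1(state)
        x = self.fc2(x)
        return self.v(x), self.pi(x)  # unpacked by the agent as (value, probs)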
123 changes: 123 additions & 0 deletions agents/DDQN.py
@@ -0,0 +1,123 @@
import os, sys, inspect

# make the project root importable when this file is run directly
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

from networks.networks import *
from extras.experience_memory import *

class Agent(object):

def __init__(self, action_size, lr=0.0001, conv=False, batch_size=32, \
gamma=0.99, epsilon_max=1.0, epsilon_min=0.0001,\
update_target_freq=3000, train_interval=100, \
mem_size=50000, fname='mitsos_dqn.h5'):

self.action_size = action_size
self.action_space = [i for i in range(action_size)]
self.lr = lr
self.epsilon_max = epsilon_max
self.epsilon_min = epsilon_min
self.epsilon = epsilon_max
self.batch_size = batch_size
self.gamma = gamma
self.update_target_freq = update_target_freq
self.train_interval = train_interval
self.model_file = fname

self.memory = Memory(n_actions=action_size)

self.model = DQNetwork(action_size,conv=conv)
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=lr))
self.target_model = DQNetwork(action_size,conv=conv)

    def choose_action(self, state):
        # epsilon-greedy: explore with probability epsilon, otherwise act greedily
        if np.random.random() < self.epsilon:
            action_idx = np.random.choice(self.action_space)
        else:
            state = tf.convert_to_tensor([state])
            q_values = self.model(state).numpy()[0]
            action_idx = np.argmax(q_values)
        return action_idx

def store_experience(self,state,action,reward,new_state,done):
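        # Transitions are stored with a "not done" flag: 1 while the episode
        # continues, 0 at termination (used to mask the bootstrap term).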
self.memory.store_experience(state,action,reward,new_state,1-int(done))

def learn(self):
        # linearly decay the exploration rate
        if self.epsilon > self.epsilon_min:
            self.epsilon -= (self.epsilon_max - self.epsilon_min) / 50000
        # periodically sync the target network with the online network
        if self.memory.memCounter % self.update_target_freq == 0:
            self.update_target_model()

        # only train once every `train_interval` stored transitions
        if self.memory.memCounter % self.train_interval != 0:
            return

n_samples = min(self.batch_size*self.train_interval, self.memory.memCounter)
states,action_ind,rewards,new_states,notdones = self.memory.sample_memory(n_samples)

        q_pred = self.model.predict(states)             # Q(s, .) from the online network
        q_eval = self.model.predict(new_states)         # Q(s', .) from the online network
        q_next = self.target_model.predict(new_states)  # Q(s', .) from the target network
        q_target = q_pred.copy()  # copy so only the taken actions' targets get overwritten

sample_index = np.arange(n_samples)
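        # Double DQN target: the online network selects the best next action
        # (argmax over q_eval) and the target network evaluates it (q_next).
        # For terminal transitions (notdones == 0) only the immediate reward remains.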
        q_target[sample_index, action_ind.astype(int)] = (
            rewards[sample_index]
            + self.gamma * notdones[sample_index] * q_next[sample_index, np.argmax(q_eval, axis=1)]
        )

self.model.fit(states,q_target,batch_size=self.batch_size,verbose=0)

return

def update_target_model(self):
self.target_model.set_weights(self.model.get_weights())
return

def save_model(self):
self.model.save_weights(self.model_file)

def load_model(self):
self.model.load_weights(self.model_file)
self.target_model.load_weights(self.model_file)



if __name__ == '__main__':
import gym
from statistics import *

env = gym.make('CartPole-v0')
agent = Agent(action_size=2)

dir_path = os.path.dirname(os.path.realpath(__file__))
L = Logger(dir=dir_path,fname='cartpole_ddqn')

n_games = 2000
scores = []
avg_score = 0

for i in range(n_games):
state = env.reset()
done = False
score = 0
while not done:
action = agent.choose_action(state)
new_state,reward,done,_ = env.step(action)
score += reward
agent.store_experience(state,action,reward,new_state,done)
state = new_state

agent.learn()

L.tick()

L.add_log('score',score)
L.save_game()
scores.append(score)
print('GAME:',i,'SCORE:',score,'AVG SCORE:',np.mean(scores[-100:]))
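
Likewise, DQNetwork (networks/networks.py) and Memory (extras/experience_memory.py) are included in this commit but not shown in this excerpt. Below is a minimal sketch of the interfaces the DDQN agent calls into, assuming a plain dense Q-network and a list-backed replay buffer; the layer sizes, buffer capacity, and handling of the conv flag are illustrative assumptions only.

import numpy as np
from tensorflow import keras
from tensorflow.keras.layers import Dense

class DQNetwork(keras.Model):
    def __init__(self, n_actions, conv=False, fc1_dims=256, fc2_dims=256):
        super(DQNetwork, self).__init__()
        # the real module presumably switches to a convolutional trunk when conv=True
        self.fc1 = Dense(fc1_dims, activation='relu')
        self.fc2 = Dense(fc2_dims, activation='relu')
        self.q = Dense(n_actions, activation=None)  # one Q-value per action

    def call(self, state):
        x = self.fc1(state)
        x = self.fc2(x)
        return self.q(x)

class Memory:
    def __init__(self, n_actions, max_size=50000):
        self.n_actions = n_actions
        self.max_size = max_size
        self.memCounter = 0  # total transitions stored; read by Agent.learn
        self.buffer = []

    def store_experience(self, state, action, reward, new_state, notdone):
        if len(self.buffer) >= self.max_size:
            self.buffer.pop(0)  # drop the oldest transition once the buffer is full
        self.buffer.append((state, action, reward, new_state, notdone))
        self.memCounter += 1

    def sample_memory(self, n_samples):
        idx = np.random.choice(len(self.buffer), n_samples, replace=False)
        batch = [self.buffer[i] for i in idx]
        states, actions, rewards, new_states, notdones = map(np.array, zip(*batch))
        return states, actions, rewards, new_states, notdones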