Skip to content


SAC tensorflow2
Browse files Browse the repository at this point in the history
  • Loading branch information
RickyMexx committed May 13, 2020
1 parent f49e711 commit f36ff92
Show file tree
Hide file tree
Showing 18 changed files with 530 additions and 0 deletions.
7 changes: 7 additions & 0 deletions SAC/
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Hyperparameters for the Soft Actor-Critic agent.
gamma = 0.99          # Discount factor for the future rewards
polyak_coef = 0.01    # Polyak parameter used when copying weights to the targets
temperature = 0.3     # Temperature parameter for the entropy term (0.2 is an alternative)
lr = 1e-3             # Learning rate for the networks
hidden_layers = 2     # Number of hidden layers in each neural network
n_hidden_units = 60   # Number of hidden units per hidden layer
167 changes: 167 additions & 0 deletions SAC/
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import tensorflow_addons as tfa
from typing import Sequence
from common.utils import *

def soft_update(source_vars: Sequence[tf.Variable], target_vars: Sequence[tf.Variable], tau: float) -> None:
    """Blend each target variable towards its corresponding source variable.

    Every target is overwritten in place with
    ``(1 - tau) * target + tau * source``; the source variables are only read.

    Arguments:
        source_vars {Sequence[tf.Variable]} -- Source variables to copy from
        target_vars {Sequence[tf.Variable]} -- Variables to copy data to
        tau {float} -- Weight given to the source value, between 0 and 1.
            0 leaves the target unchanged; 1 replaces a finite target value
            with the source value.

    Raises:
        ValueError -- If the two sequences do not have the same length.
    """
    # zip() would silently truncate to the shorter sequence, so check first.
    if len(source_vars) != len(target_vars):
        raise ValueError("source_vars and target_vars must have the same length.")

    for source, target in zip(source_vars, target_vars):
        target.assign((1.0 - tau) * target + tau * source)

def hard_update(source_vars: Sequence[tf.Variable], target_vars: Sequence[tf.Variable]) -> None:
    """Copy source variables to target variables.

    Arguments:
        source_vars {Sequence[tf.Variable]} -- Source variables to copy from
        target_vars {Sequence[tf.Variable]} -- Variables to copy data to

    Raises:
        ValueError -- If the two sequences do not have the same length.
    """
    if len(source_vars) != len(target_vars):
        raise ValueError("source_vars and target_vars must have the same length.")

    # Assign directly rather than blending with a factor of 1: the blend
    # evaluates ``0 * target + source``, which yields NaN whenever the target
    # currently holds inf or NaN, so the "copy" would not be exact.
    for source, target in zip(source_vars, target_vars):
        target.assign(source)

class SAC:
    """Soft Actor-Critic model: one actor, two soft Q-networks and their targets.

    The actor network is called with a batch of states and returns a pair whose
    first element is the action batch and whose second element is the matching
    log-probability. Each soft Q-network is called with ``(states, actions)``.
    """

    def __init__(self, obs_dim, n_actions, act_lim, seed, discount, temperature, polyak_coef, lr,
                 hidden_layers, n_hidden_units, save_dir, env):
        """Store the hyperparameters and build the networks and optimizers.

        Arguments:
            obs_dim -- Shape handed to ``tf.keras.Input`` for the observations
            n_actions -- Shape handed to ``tf.keras.Input`` for the actions
            act_lim -- Action limit (stored only)
            seed -- Random seed (stored only)
            discount -- Discount factor, stored as ``self.gamma``
            temperature -- Entropy temperature used in the soft Q target
            polyak_coef -- Polyak coefficient for the target-network updates
            lr -- Learning rate of the three optimizers
            hidden_layers -- Number of hidden layers of every network
            n_hidden_units -- Number of hidden units per hidden layer
            save_dir -- Directory for checkpoints (stored only)
            env -- Environment handle (stored only)
        """
        self.obs_dim = obs_dim
        self.n_actions = n_actions
        self.act_lim = act_lim
        # Each hyperparameter gets its own attribute: train() reads self.gamma
        # and self.polyak_coef, so they must hold the discount and the Polyak
        # coefficient rather than a neighbouring argument.
        self.seed = seed
        self.gamma = discount
        self.temperature = temperature
        self.polyak_coef = polyak_coef
        self.lr = lr
        self.save_dir = save_dir
        self.env = env

        ### Creating networks and optimizers ###
        # Policy network
        logprob_epsilon = 1e-6  # For numerical stability when computing tf.log
        self.actor_network = ActorNetwork(hidden_layers, n_hidden_units, n_actions, logprob_epsilon)

        # 2 soft Q-function networks + targets
        self.softq_network = SoftQNetwork(hidden_layers, n_hidden_units)
        self.softq_target_network = SoftQNetwork(hidden_layers, n_hidden_units)

        self.softq_network2 = SoftQNetwork(hidden_layers, n_hidden_units)
        self.softq_target_network2 = SoftQNetwork(hidden_layers, n_hidden_units)

        # Call every Q-network once on symbolic inputs so that its variables
        # exist before the online weights are copied into the targets.
        # NOTE(review): obs_dim / n_actions are passed through unchanged as the
        # Keras input shape -- confirm the caller's type is accepted by Keras.
        input1 = tf.keras.Input(shape=obs_dim, dtype=tf.float32)
        input2 = tf.keras.Input(shape=n_actions, dtype=tf.float32)

        self.softq_network(input1, input2)
        self.softq_target_network(input1, input2)
        hard_update(self.softq_network.variables, self.softq_target_network.variables)

        self.softq_network2(input1, input2)
        self.softq_target_network2(input1, input2)
        hard_update(self.softq_network2.variables, self.softq_target_network2.variables)

        # Optimizers for the networks
        self.softq_optimizer = tfa.optimizers.RectifiedAdam(learning_rate=lr)
        self.softq_optimizer2 = tfa.optimizers.RectifiedAdam(learning_rate=lr)
        self.actor_optimizer = tfa.optimizers.RectifiedAdam(learning_rate=lr)

    def softq_value(self, states: np.ndarray, actions: np.ndarray):
        """Evaluate the first soft Q-network on a batch of (state, action) pairs."""
        return self.softq_network(states, actions)

    def softq_value2(self, states: np.ndarray, actions: np.ndarray):
        """Evaluate the second soft Q-network on a batch of (state, action) pairs."""
        return self.softq_network2(states, actions)

    def actions(self, states: np.ndarray) -> np.ndarray:
        """Get the actions for a batch of states."""
        return self.actor_network(states)[0]

    def action(self, state: np.ndarray) -> np.ndarray:
        """Get the action for a single state."""
        # Add a leading batch axis for the network, then strip it from the result.
        return self.actor_network(state[None, :])[0][0]

    def step(self, obs):
        """Return the first output of the actor network for ``obs``."""
        return self.actor_network(obs)[0]

    def train(self, sample, action_batch, batch_size):
        """Run one update of both soft Q-networks, the actor and the targets.

        Arguments:
            sample -- Mapping with the keys "states0", "rewards", "states1"
                and "terminals1"
            action_batch -- Actions taken in ``sample["states0"]``
            batch_size -- Number of transitions; used to reshape the Q targets
                to ``[batch_size, 1]``

        Returns:
            Tuple of (first entry of the mean of the first Q-network output,
            first entry of its standard deviation, first Q loss, actor loss,
            mean action log-probability on ``states0``).
        """
        state0_batch = sample["states0"]
        reward_batch = sample["rewards"]
        state1_batch = sample["states1"]
        terminal1_batch = sample["terminals1"]

        # Actions (and their log-probabilities) proposed for the next states
        action, action_logprob2 = self.actor_network(state1_batch)

        value_target1 = self.softq_target_network(state1_batch, action)
        value_target2 = self.softq_target_network2(state1_batch, action)

        # Taking the minimum of the q-functions values, minus the entropy term
        # NOTE(review): assumes the log-probability broadcasts element-wise
        # against the Q-network output -- confirm against ActorNetwork.
        next_value_batch = tf.math.minimum(value_target1, value_target2) - self.temperature * action_logprob2

        # Computing target for q-functions; terminal transitions keep only the reward
        softq_targets = reward_batch + self.gamma * (1 - terminal1_batch) * tf.reshape(next_value_batch, [-1])
        softq_targets = tf.reshape(softq_targets, [batch_size, 1])

        # Loss of the first q-function
        with tf.GradientTape() as softq_tape:
            softq = self.softq_network(state0_batch, action_batch)
            softq_loss = tf.reduce_mean(tf.square(softq - softq_targets))

        # Loss of the second q-function
        with tf.GradientTape() as softq_tape2:
            softq2 = self.softq_network2(state0_batch, action_batch)
            softq_loss2 = tf.reduce_mean(tf.square(softq2 - softq_targets))

        # Loss of the policy (actor)
        with tf.GradientTape() as actor_tape:
            actions, action_logprob = self.actor_network(state0_batch)
            new_softq = tf.math.minimum(self.softq_network(state0_batch, actions),
                                        self.softq_network2(state0_batch, actions))

            # The loss from the pseudocode, mean(action_logprob - new_softq),
            # was reported by the author to work worse than this variant, in
            # which the (log-prob - Q) term is treated as a constant weight.
            advantage = tf.stop_gradient(action_logprob - new_softq)
            actor_loss = tf.reduce_mean(action_logprob * advantage)

        # All gradients are computed before any weight changes, so the three
        # losses are differentiated against the same set of weights.
        actor_gradients = actor_tape.gradient(actor_loss, self.actor_network.trainable_weights)
        softq_gradients = softq_tape.gradient(softq_loss, self.softq_network.trainable_weights)
        softq_gradients2 = softq_tape2.gradient(softq_loss2, self.softq_network2.trainable_weights)

        # Apply the gradients
        self.actor_optimizer.apply_gradients(zip(actor_gradients, self.actor_network.trainable_weights))
        self.softq_optimizer.apply_gradients(zip(softq_gradients, self.softq_network.trainable_weights))
        self.softq_optimizer2.apply_gradients(zip(softq_gradients2, self.softq_network2.trainable_weights))

        # Update the weights of the soft q-function target networks
        soft_update(self.softq_network.variables, self.softq_target_network.variables, self.polyak_coef)
        soft_update(self.softq_network2.variables, self.softq_target_network2.variables, self.polyak_coef)

        # Computing mean and variance of soft-q function (pre-update values)
        softq_mean, softq_variance = tf.nn.moments(softq, axes=[0])

        return softq_mean[0], tf.sqrt(softq_variance[0]), softq_loss, actor_loss, tf.reduce_mean(action_logprob)

    def save(self):
        """Print a confirmation message.

        NOTE(review): no weights are written here -- confirm whether the
        checkpointing code is missing from this revision.
        """
        print("Model saved!")

    def load(self, filepath):
        """Print a confirmation message; ``filepath`` is currently unused.

        NOTE(review): no weights are read here -- confirm whether the
        restore code is missing from this revision.
        """
        print("Model loaded!")

100 changes: 100 additions & 0 deletions common/
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from common.utils import *

class Agent:
    """Drives a model through training and evaluation episodes in Gym-style environments."""

    def __init__(self, model, replay_buffer, train_env, test_env, replay_start_size,
                 n_episodes, batch_size, n_actions):
        """Store the collaborators and the loop settings.

        Arguments:
            model -- Object providing ``action(state)`` and
                ``train(sample, action_batch, batch_size)``
            replay_buffer -- Object providing ``add``, ``n_entries`` and ``get_batch``
            train_env -- Environment used by ``train``
            test_env -- Environment used by ``test``
            replay_start_size -- Buffer size that must be exceeded before
                model updates start
            n_episodes -- Number of training episodes
            batch_size -- Number of transitions sampled per model update
            n_actions -- Action dimension, used for the noise process and to
                shape the sampled actions
        """
        self.model = model
        self.replay_buffer = replay_buffer
        self.train_env = train_env
        self.test_env = test_env
        self.replay_start_size = replay_start_size
        self.batch_size = batch_size
        self.n_episodes = n_episodes
        self.n_actions = n_actions
        # Upper bound on the steps per episode, read from the environment spec.
        # NOTE(review): .get() returns None when the tag is absent, which would
        # make range(self.n_timesteps) in train() fail -- confirm it is always set.
        self.n_timesteps = train_env.spec.tags.get("wrapper_config.TimeLimit.max_episode_steps")
        self.total_steps = 0
        self.total_episodes = 0

    def train(self):
        """Collect experience for ``n_episodes`` episodes and update the model.

        Model updates start once the replay buffer holds more than
        ``replay_start_size`` entries. The per-episode lengths and rewards are
        handed to ``plot_episode_stats`` at the end.
        """
        buffer_filling = True  # True until the replay buffer is large enough to train
        episode_lengths = [None] * self.n_episodes
        episode_rewards = [None] * self.n_episodes

        # Parameters for the consecutive actions technique
        cons_acts = 4
        prob_act = 0.5

        # Noise + epsilon parameters
        noise = OUNoise(self.n_actions)
        epsilon = 1
        epsilon_min = 0.1
        epsilon_dk = 0.999

        for e in range(self.n_episodes):
            state = self.train_env.reset().astype(np.float32)
            episode_reward = 0
            episode_length = 0

            for k in range(self.n_timesteps):
                action = self.model.action(state)

                #### Techniques to force exploration, useful in sparse rewards environments ####

                # Using the consecutive steps technique (only while the buffer is filling)
                if buffer_filling and np.random.uniform() < prob_act:
                    for _ in range(cons_acts):
                        # NOTE(review): the loop body was missing from the
                        # reviewed text; repeating the action in the environment
                        # is reconstructed from the "consecutive steps" comments
                        # -- confirm against the repository.
                        self.train_env.step(action)

                # Using OUNoise technique + epsilon-greedy
                if np.random.uniform() < epsilon:
                    action = noise.get_action(action, k)
                # Epsilon only decays once training has started
                if not buffer_filling and epsilon > epsilon_min:
                    epsilon = epsilon * epsilon_dk

                new_state, reward, done, _ = self.train_env.step(action)
                new_state = new_state.astype(np.float32)
                episode_length += 1
                self.total_steps += 1
                episode_reward += reward
                self.replay_buffer.add(state, action, reward, new_state, done)

                if self.replay_buffer.n_entries > self.replay_start_size:
                    if buffer_filling:
                        print("The buffer is ready, training is starting!")
                        buffer_filling = False

                    sample = self.replay_buffer.get_batch(self.batch_size)
                    # The returned training statistics are currently unused.
                    softq_mean, softq_std, softq_loss, actor_loss, action_logprob_mean = self.model.train(
                        sample,
                        np.resize(sample["actions"], [self.batch_size, self.n_actions]),
                        self.batch_size)

                state = new_state

                if done:
                    # Record the number of steps actually taken; the loop index
                    # k is zero-based and therefore one short.
                    episode_lengths[e] = episode_length
                    episode_rewards[e] = episode_reward
                    self.total_episodes += 1
                    print("Episode n.", self.total_episodes, "is end! The reward is:", episode_reward,
                          ", number of steps:", episode_length)
                    # Stop stepping an environment that has already terminated.
                    break

        plot_episode_stats(episode_lengths, episode_rewards)

    def test(self, model_path):
        """Run the model in the test environment, episode after episode.

        ``model_path`` is currently unused and the loop never terminates on
        its own, so the caller has to interrupt it.
        """
        while True:
            obs, done = self.test_env.reset(), False
            while not done:
                action = self.model.action(obs.astype(np.float32))
                obs, reward, done, info = self.test_env.step(action)

0 comments on commit f36ff92

Please sign in to comment.