# gps_only.py

import gym
import DroneEnv
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import regularizers
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import pickle
import dronekit
from dronekit import LocationGlobalRelative

# Uncomment both lines to seed NumPy and TensorFlow.
# np.random.seed(0)
# tf.random.set_seed(0)

# The optional first command-line argument is forwarded to the environment as
# `other_ip`; without it the loopback address is used.
other_ip = sys.argv[1] if len(sys.argv) > 1 else '127.0.0.1'

# Largest magnitude of a single action. It is also handed to the environment
# as `lim` and reused below as the symmetric action bound.
lim_to_changes = 2e-7

## model_s4
# Fixed locations for this scenario; the destination and the adverse location
# are forwarded to the environment below.
homeLocation = LocationGlobalRelative(-35.363262, 149.165237, 40)
destLoaction = LocationGlobalRelative(-35.360262, 149.165237, 40)
adverseLocation = LocationGlobalRelative(-35.361262, 149.164537, 40)

# NOTE(review): "drone-v1" is presumably registered with gym by the `DroneEnv`
# import at the top of the file - confirm.
problem = "drone-v1"
env = gym.make(
    problem,
    other_ip=other_ip,
    lim=lim_to_changes,
    dest=destLoaction,
    adverse=adverseLocation,
    reward_fn=0,
)

# The state/action sizes are hard-coded instead of being read from the
# environment spaces:
# num_states = env.observation_space.shape[0]
# num_actions = env.action_space.shape[0]
num_states = 3
num_actions = 1
print("Size of State Space ->  {}".format(num_states))
print("Size of Action Space ->  {}".format(num_actions))

# Actions are bounded symmetrically around zero.
upper_bound, lower_bound = lim_to_changes, -lim_to_changes

print("Max Value of Action ->  {}".format(upper_bound))
print("Min Value of Action ->  {}".format(lower_bound))

dropped = 0

class OUActionNoise:
    """Temporally correlated exploration noise (discretised Ornstein-Uhlenbeck step).

    Each call moves the previous sample towards `mean` at rate `theta * dt` and
    adds Gaussian noise scaled by `std_deviation * sqrt(dt)`. The noise has the
    same shape as `mean`.
    """

    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.mean = mean
        self.std_dev = std_deviation
        self.theta = theta
        self.dt = dt
        # Optional starting point; when None the process starts from zeros.
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new sample."""
        previous = self.x_prev
        pull_to_mean = self.theta * (self.mean - previous) * self.dt
        random_kick = (
            self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        )
        sample = previous + pull_to_mean + random_kick
        # Remember the sample so that the next one depends on it.
        self.x_prev = sample
        return sample

    def reset(self):
        """Restart the process from `x_initial`, or from zeros if none was given."""
        if self.x_initial is None:
            self.x_prev = np.zeros_like(self.mean)
        else:
            self.x_prev = self.x_initial

class Buffer:
    """Fixed-size experience store plus the actor/critic update that trains on it.

    The class relies on module-level names: `num_states` / `num_actions` for the
    array widths, and `actor_model`, `critic_model`, `target_actor`,
    `target_critic`, `actor_optimizer`, `critic_optimizer` and `gamma` inside
    `update`.
    """

    def __init__(self, buffer_capacity=100000, batch_size=64):
        # Maximum number of stored transitions and size of a training batch.
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        # Total number of record() calls; it keeps growing past the capacity.
        self.buffer_counter = 0
        # Counter kept for an optional (currently disabled) sub-sampling of
        # zero-reward transitions; nothing in this class updates it.
        self.dropped = 0

        # One array per transition component, one row per stored transition.
        self.state_buffer = np.zeros((self.buffer_capacity, num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states))

    def record(self, obs_tuple):
        """Store one (s, a, r, s') tuple, reusing the oldest slot once full."""
        # The modulo wraps the write position back to row 0 after the last row.
        slot = self.buffer_counter % self.buffer_capacity
        storages = (
            self.state_buffer,
            self.action_buffer,
            self.reward_buffer,
            self.next_state_buffer,
        )
        for position, storage in enumerate(storages):
            storage[slot] = obs_tuple[position]

        self.buffer_counter += 1

    # tf.function compiles the update into a graph, which speeds up this block
    # of many small TensorFlow operations compared with eager execution.
    @tf.function
    def update(
        self, state_batch, action_batch, reward_batch, next_state_batch,
    ):
        """Run one critic step followed by one actor step on the given batch."""
        # Critic: regress Q(s, a) onto r + gamma * Q_target(s', actor_target(s')).
        with tf.GradientTape() as critic_tape:
            next_actions = target_actor(next_state_batch, training=True)
            next_value = target_critic(
                [next_state_batch, next_actions], training=True
            )
            td_target = reward_batch + gamma * next_value
            predicted_value = critic_model([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(
                tf.math.square(td_target - predicted_value)
            )

        critic_vars = critic_model.trainable_variables
        critic_optimizer.apply_gradients(
            zip(critic_tape.gradient(critic_loss, critic_vars), critic_vars)
        )

        # Actor: maximise the critic's value of the actor's own actions, which
        # is expressed as minimising the negated mean value.
        with tf.GradientTape() as actor_tape:
            proposed_actions = actor_model(state_batch, training=True)
            proposed_value = critic_model([state_batch, proposed_actions], training=True)
            actor_loss = -tf.math.reduce_mean(proposed_value)

        actor_vars = actor_model.trainable_variables
        actor_optimizer.apply_gradients(
            zip(actor_tape.gradient(actor_loss, actor_vars), actor_vars)
        )

    def learn(self):
        """Sample a random batch from the stored transitions and train on it."""
        # Only rows that have actually been written may be sampled.
        filled = min(self.buffer_counter, self.buffer_capacity)
        if filled == 0:
            return

        # np.random.choice samples with replacement by default, so a batch can
        # be drawn even when fewer than `batch_size` rows are stored.
        picks = np.random.choice(filled, self.batch_size)

        def as_tensor(storage):
            return tf.convert_to_tensor(storage[picks])

        self.update(
            as_tensor(self.state_buffer),
            as_tensor(self.action_buffer),
            tf.cast(as_tensor(self.reward_buffer), dtype=tf.float32),
            as_tensor(self.next_state_buffer),
        )


# Soft update of the target parameters: each target variable moves a fraction
# `tau` (much less than one) of the way towards its online counterpart.
@tf.function
def update_target(target_weights, weights, tau):
    for target_var, online_var in zip(target_weights, weights):
        blended = online_var * tau + target_var * (1 - tau)
        target_var.assign(blended)


def get_actor():
    """Build the actor network that maps a state to one bounded action value.

    Reads the module-level `num_states` and `upper_bound`.
    """
    # Very small initial weights for the output layer.
    final_init = tf.random_uniform_initializer(minval=-0.0001, maxval=0.0001)

    state_in = layers.Input(shape=(num_states,))
    hidden = state_in
    # Two identical hidden stages: 256 ReLU units followed by 30% dropout.
    for _ in range(2):
        hidden = layers.Dense(256, activation="relu")(hidden)
        hidden = layers.Dropout(0.3)(hidden)
    squashed = layers.Dense(1, activation="tanh", kernel_initializer=final_init)(hidden)

    # tanh yields values in [-1, 1]; scaling keeps the output within the bound.
    return tf.keras.Model(state_in, squashed * upper_bound)


def get_critic():
    """Build the critic network that scores a (state, action) pair.

    Reads the module-level `num_states` and `num_actions`. The returned model
    takes the list `[state, action]` as input and produces a single value per
    pair.
    """
    # State as input. The shape is passed as a 1-tuple, matching get_actor();
    # a bare int is not a valid shape for every Keras version.
    state_input = layers.Input(shape=(num_states,))
    state_out = layers.Dense(16, activation="relu")(state_input)
    state_out = layers.Dense(32, activation="relu")(state_out)

    # Action as input
    action_input = layers.Input(shape=(num_actions,))
    action_out = layers.Dense(16, activation="relu")(action_input)
    action_out = layers.Dense(32, activation="relu")(action_out)

    # Both are passed through separate layers before concatenating
    concat = layers.Concatenate()([state_out, action_out])
    concat = layers.Dropout(0.4)(concat)
    out = layers.Dense(256, activation="relu")(concat)
    out = layers.Dense(256, activation="relu")(out)
    outputs = layers.Dense(1)(out)

    # Outputs a single value for the given state-action pair
    model = tf.keras.Model([state_input, action_input], outputs)

    return model


"""
`policy()` returns an action sampled from our Actor network plus some noise for
exploration.
"""


def policy(state, noise_object):
    """Return the actor's action for `state` plus noise, clipped to the bounds.

    `noise_object` is called once with no arguments to obtain the noise. The
    result is a one-element list holding the clipped action.
    """
    actor_output = tf.squeeze(actor_model(state)).numpy()
    # Exploration: perturb the deterministic action with the supplied noise.
    noisy_action = actor_output + noise_object()
    # Keep the action within [lower_bound, upper_bound].
    bounded_action = np.clip(noisy_action, lower_bound, upper_bound)
    return [np.squeeze(bounded_action)]

def policy_gaussian_noise(state, std_dev):
    """Return the actor's action plus zero-mean Gaussian noise, clipped to the bounds.

    `std_dev` is the standard deviation of the noise. The result is a
    one-element list holding the clipped action.
    """
    actor_output = tf.squeeze(actor_model(state)).numpy()
    jitter = np.random.normal(0, std_dev)
    bounded_action = np.clip(actor_output + jitter, lower_bound, upper_bound)
    return [np.squeeze(bounded_action)]

def policy_gaussian_random(state, std_dev,prob):
    """Pick a uniformly random action with probability `prob`, else a noisy actor action.

    In the second case the actor's output is perturbed with zero-mean Gaussian
    noise of standard deviation `std_dev` and clipped to the bounds. Either way
    the result is a one-element list.
    """
    explore_uniformly = np.random.rand() < prob
    if not explore_uniformly:
        actor_output = tf.squeeze(actor_model(state)).numpy()
        noisy_action = actor_output + np.random.normal(0, std_dev, size=(num_actions))
        bounded_action = np.clip(noisy_action, lower_bound, upper_bound)
        return [np.squeeze(bounded_action)]

    # Ignore the actor entirely and draw from the whole action range.
    return [np.array(np.random.uniform(low=lower_bound, high=upper_bound))]

def policy_ou_random(state,prob):
    """Pick a uniformly random action with probability `prob`, else a noisy actor action.

    In the second case the actor's output is perturbed with one sample from the
    module-level `noise_obj` and clipped to [lower_bound, upper_bound], like the
    other noisy policies in this file. Either way the result is a one-element
    list.
    """
    if np.random.rand() < prob:
        # Ignore the actor entirely and draw from the whole action range.
        return [np.array(np.random.uniform(low=lower_bound, high=upper_bound))]

    sampled_actions = tf.squeeze(actor_model(state)).numpy() + noise_obj()
    # The added noise can push the action past the bound, so clip it back.
    legal_action = np.clip(sampled_actions, lower_bound, upper_bound)
    return [np.squeeze(legal_action)]

def policy_without_noise(state):
    """Return the actor's action for `state`, clipped to the bounds, with no noise.

    The result is a one-element list holding the clipped action.
    """
    actor_output = tf.squeeze(actor_model(state)).numpy()
    bounded_action = np.clip(actor_output, lower_bound, upper_bound)
    return [np.squeeze(bounded_action)]

def loadBuffer(buffer):
    """Fill `buffer` from four comma-separated text files in the working directory.

    Reads 'state_buffer.txt', 'action_buffer.txt', 'reward_buffer.txt' and
    'next_state_buffer.txt', copies their leading rows into the matching arrays
    of `buffer`, and sets `buffer.buffer_counter` to the number of rows copied.

    The number of rows copied is the smallest of `buffer.buffer_capacity` and
    the row counts of the four files, so files that hold more rows than the
    buffer are truncated and files of unequal length are cut to the shortest.

    Args:
        buffer: object exposing `buffer_capacity`, `buffer_counter` and the
            two-dimensional arrays `state_buffer`, `action_buffer`,
            `reward_buffer` and `next_state_buffer`.

    Returns:
        The same `buffer` object.

    Raises:
        ValueError: if a file's column count differs from the width of the
            array it is loaded into.
    """
    destinations = (
        ('state_buffer.txt', buffer.state_buffer),
        ('action_buffer.txt', buffer.action_buffer),
        ('reward_buffer.txt', buffer.reward_buffer),
        ('next_state_buffer.txt', buffer.next_state_buffer),
    )

    tables = []
    for path, storage in destinations:
        width = storage.shape[1]
        # ndmin=2 keeps a one-row (or one-value) file two-dimensional; without
        # it np.loadtxt collapses such files and the row count is misread.
        table = np.loadtxt(path, delimiter=',', ndmin=2)
        if table.size == 0:
            # An empty file simply contributes zero rows.
            table = table.reshape(0, width)
        elif table.shape[1] != width:
            raise ValueError(
                "{} has {} column(s), expected {}".format(path, table.shape[1], width)
            )
        tables.append(table)

    size = min([buffer.buffer_capacity] + [table.shape[0] for table in tables])
    for (_, storage), table in zip(destinations, tables):
        storage[0:size, :] = table[:size]

    buffer.buffer_counter = size
    return buffer

"""
## Training hyperparameters
"""
# Exploration scale. It is used both to size the Gaussian noise and as the
# probability of a uniformly random action, and it decays by 0.9 per episode
# down to a floor of 0.2. An earlier assignment of 1.2 was immediately
# overwritten by this value, so only this one ever took effect.
factor = max(0.2, pow(0.9, 14))
std_dev = factor * lim_to_changes

# Correlated noise source read by policy_ou_random().
noise_obj = OUActionNoise(
    mean=np.zeros(1),
    std_deviation=(lim_to_changes * 0.8),
    dt=1,
)

# Online networks first, then their target copies.
actor_model = get_actor()
critic_model = get_critic()
target_actor = get_actor()
target_critic = get_critic()

# Start each target network from its online network's weights.
target_actor.set_weights(actor_model.get_weights())
target_critic.set_weights(critic_model.get_weights())

# Learning rates and optimizers for the critic and the actor.
critic_lr, actor_lr = 0.08, 0.06
critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

total_episodes = 30
# Discount factor for future rewards
gamma = 0.95
# Rate used to update the target networks
tau = 0.05

# Resume from the previously stored replay buffer and model weights; the
# buffer text files and the four .h5 files must already exist.
buffer = loadBuffer(Buffer(50000, 64))
actor_model.load_weights('actor_1.h5')
critic_model.load_weights('critic_1.h5')
target_actor.load_weights('target_actor_1.h5')
target_critic.load_weights('target_critic_1.h5')

"""
Now we implement our main training loop and iterate over episodes.
We sample actions using `policy_gaussian_random()` and train with `learn()` at each
time step, while also updating the target networks at a rate `tau`.
"""

# To store reward history of each episode
ep_reward_list = []
# To store average reward history of last few episodes
avg_reward_list = []

# Every per-step reward across all episodes, in order
lst = []
# Number of environment steps taken in each episode
steps_list = []

# Both text logs are opened through context managers so that they are flushed
# and closed even if an episode raises part-way through.
with open('awards.txt', 'w') as award_file:
    for ep in range(total_episodes):
        prev_state = env.reset()
        episodic_reward = 0
        steps = 0
        # Mode 'w' truncates the action log at the start of every episode, so
        # the file only ever holds the actions of the most recent episode.
        with open('log_actions.txt', 'w') as logging_file:
            while True:
                # Uncomment this to see the Actor in action
                # But not in a python notebook.
                # env.render()
                steps += 1
                tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)

                # `factor` is passed as the probability of a uniformly random
                # action; `std_dev` sizes the Gaussian noise otherwise.
                action = policy_gaussian_random(tf_prev_state, std_dev, factor)
                # Alternative policies:
                # action = policy_ou_random(tf_prev_state,factor)
                # action = policy_without_noise(tf_prev_state)

                modif = action[0]
                logging_file.write(str(modif) + '\n')
                # Receive state and reward from environment.
                state, reward, done, info = env.step(action)

                buffer.record((prev_state, action, reward, state))
                episodic_reward += reward
                lst.append(reward)
                # One training step and one soft target update per env step.
                buffer.learn()
                update_target(target_actor.variables, actor_model.variables, tau)
                update_target(target_critic.variables, critic_model.variables, tau)
                # End this episode when `done` is True
                if done:
                    break

                prev_state = state

        steps_list.append(steps)
        ep_reward_list.append(episodic_reward)
        award_file.write(str(episodic_reward) + '\n')

        # Checkpoint the networks and the replay buffer after every episode.
        actor_model.save_weights("actor_1.h5")
        critic_model.save_weights("critic_1.h5")
        target_actor.save_weights("target_actor_1.h5")
        target_critic.save_weights("target_critic_1.h5")

        np.savetxt('state_buffer.txt',buffer.state_buffer[0:buffer.buffer_counter,:],delimiter=',')
        np.savetxt('action_buffer.txt',buffer.action_buffer[0:buffer.buffer_counter,:],delimiter=',')
        np.savetxt('reward_buffer.txt',buffer.reward_buffer[0:buffer.buffer_counter,:],delimiter=',')
        np.savetxt('next_state_buffer.txt',buffer.next_state_buffer[0:buffer.buffer_counter,:],delimiter=',')
        np.savetxt('complete_rewards_list.txt',np.array(lst),delimiter=',')

        # Mean of the last 5 episodes
        avg_reward = np.mean(ep_reward_list[-5:])
        print("Episode * {} * Avg Reward is ==> {}".format(ep, avg_reward))
        avg_reward_list.append(avg_reward)
        print(ep_reward_list)
        print(steps_list)

        # Decay the exploration scale (floor 0.2) and restart the OU noise.
        factor = max(0.2,factor*0.9)
        std_dev = factor * lim_to_changes
        noise_obj.reset()

env.end_all()
# Plotting graph
# Episodes versus Avg. Rewards
plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Epsiodic Reward")
plt.show()

"""
If training proceeds correctly, the average episodic reward will increase with time.
Feel free to try different learning rates, `tau` values, and architectures for the
Actor and Critic networks.
The Inverted Pendulum problem has low complexity, but DDPG works well on many other
problems.
Another good environment to try this on is `LunarLanderContinuous-v2`, but it will take
more episodes to obtain good results.
"""

# Final checkpoint: write the weights of all four networks ...
for _model, _weights_path in (
    (actor_model, "actor_1.h5"),
    (critic_model, "critic_1.h5"),
    (target_actor, "target_actor_1.h5"),
    (target_critic, "target_critic_1.h5"),
):
    _model.save_weights(_weights_path)

# ... and the rows of the replay buffer up to `buffer_counter`, one
# comma-separated text file per transition component.
_filled = buffer.buffer_counter
for _buffer_path, _storage in (
    ('state_buffer.txt', buffer.state_buffer),
    ('action_buffer.txt', buffer.action_buffer),
    ('reward_buffer.txt', buffer.reward_buffer),
    ('next_state_buffer.txt', buffer.next_state_buffer),
):
    np.savetxt(_buffer_path, _storage[0:_filled, :], delimiter=',')

print(ep_reward_list)