From c11acae80ccd614e9a9db89903683973d3553662 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 5 Aug 2024 22:23:43 +0200
Subject: [PATCH 01/37] Add Training State to APP Reinforcement Learning

---
 .../src/TrainingState.cpp                     |  91 ++++++++++++
 .../src/TrainingState.h                       | 137 ++++++++++++++++++
 2 files changed, 228 insertions(+)
 create mode 100644 lib/APPReinforcementLearning/src/TrainingState.cpp
 create mode 100644 lib/APPReinforcementLearning/src/TrainingState.h

diff --git a/lib/APPReinforcementLearning/src/TrainingState.cpp b/lib/APPReinforcementLearning/src/TrainingState.cpp
new file mode 100644
index 00000000..91e91597
--- /dev/null
+++ b/lib/APPReinforcementLearning/src/TrainingState.cpp
@@ -0,0 +1,91 @@
+/* MIT License
+ *
+ * Copyright (c) 2023 - 2024 Andreas Merkle <web@blue-andi.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*******************************************************************************
+    DESCRIPTION
+*******************************************************************************/
+/**
+ * @brief  Training state
+ * @author Akram Bziouech
+ */
+
+/******************************************************************************
+ * Includes
+ *****************************************************************************/
+#include "TrainingState.h"
+#include <StateMachine.h>
+#include <DifferentialDrive.h>
+
+/******************************************************************************
+ * Compiler Switches
+ *****************************************************************************/
+
+/******************************************************************************
+ * Macros
+ *****************************************************************************/
+
+/******************************************************************************
+ * Types and classes
+ *****************************************************************************/
+
+/******************************************************************************
+ * Prototypes
+ *****************************************************************************/
+
+/******************************************************************************
+ * Local Variables
+ *****************************************************************************/
+
+/******************************************************************************
+ * Public Methods
+ *****************************************************************************/
+
+void TrainingState::entry()
+{
+    DifferentialDrive& diffDrive = DifferentialDrive::getInstance();
+    diffDrive.setLinearSpeed(0, 0);
+    diffDrive.disable();
+}
+
+void TrainingState::process(StateMachine& sm)
+{
+    /* Nothing to do. */
+    (void)sm;
+}
+
+void TrainingState::exit()
+{
+    DifferentialDrive::getInstance().enable();
+}
+
+/******************************************************************************
+ * Protected Methods
+ *****************************************************************************/
+
+/******************************************************************************
+ * Private Methods
+ *****************************************************************************/
+
+/******************************************************************************
+ * External Functions
+ *****************************************************************************/
diff --git a/lib/APPReinforcementLearning/src/TrainingState.h b/lib/APPReinforcementLearning/src/TrainingState.h
new file mode 100644
index 00000000..737d7a89
--- /dev/null
+++ b/lib/APPReinforcementLearning/src/TrainingState.h
@@ -0,0 +1,137 @@
+/* MIT License
+ *
+ * Copyright (c) 2023 - 2024 Andreas Merkle <web@blue-andi.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*******************************************************************************
+    DESCRIPTION
+*******************************************************************************/
+/**
+ * @brief  Training state
+ * @author Akram Bziouech
+ *
+ * @addtogroup Application
+ *
+ * @{
+ */
+
+#ifndef TRAINING_STATE_H
+#define TRAINING_STATE_H
+
+/******************************************************************************
+ * Compile Switches
+ *****************************************************************************/
+
+/******************************************************************************
+ * Includes
+ *****************************************************************************/
+#include <stdint.h>
+#include <IState.h>
+#include <SimpleTimer.h>
+
+/******************************************************************************
+ * Macros
+ *****************************************************************************/
+
+/******************************************************************************
+ * Types and Classes
+ *****************************************************************************/
+
+/** The system Training state. */
+class TrainingState : public IState
+{
+public:
+    /**
+     * Get state instance.
+     *
+     * @return State instance.
+     */
+    static TrainingState& getInstance()
+    {
+        static TrainingState instance;
+
+        /* Singleton idiom to force initialization during first usage. */
+
+        return instance;
+    }
+
+    /**
+     * If the state is entered, this method will be called once.
+     */
+    void entry() final;
+
+    /**
+     * Processing the state.
+     *
+     * @param[in] sm State machine, which is calling this state.
+     */
+    void process(StateMachine& sm) final;
+
+    /**
+     * If the state is left, this method will be called once.
+     */
+    void exit() final;
+
+protected:
+private:
+    /**
+     * Default constructor.
+     */
+    TrainingState()
+    {
+    }
+
+    /**
+     * Default destructor.
+     */
+    ~TrainingState()
+    {
+    }
+
+    /**
+     * Copy construction of an instance.
+     * Not allowed.
+     *
+     * @param[in] state Source instance.
+     */
+    TrainingState(const TrainingState& state);
+
+    /**
+     * Assignment of an instance.
+     * Not allowed.
+     *
+     * @param[in] state Source instance.
+     *
+     * @returns Reference to TrainingState instance.
+     */
+    TrainingState& operator=(const TrainingState& state);
+};
+
+/******************************************************************************
+ * Functions
+ *****************************************************************************/
+
+#endif /* TRAINING_STATE_H */
+/** @} */
+
+/******************************************************************************
+ * Local Functions
+ *****************************************************************************/

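The state added above is entered remotely: patch 02 handles the command CMD_ID_SET_TRAINING_STATE on the SerialMuxProt "CMD" channel, and the supervisor in patch 05 sends that command (value 2) when an episode ends. A minimal Python sketch of that request, assuming an smp_server object exposing the send_data() call used throughout this series:

    import struct

    CMD_ID_SET_TRAINING_STATE = 2  # command ID as defined in agent.py (patch 05)

    def request_training_state(smp_server) -> bool:
        """Ask the robot to switch to TrainingState; returns False if sending failed."""
        cmd_payload = struct.pack("B", CMD_ID_SET_TRAINING_STATE)
        return smp_server.send_data("CMD", cmd_payload)
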
From 8a75912b3e5201e260fdc95e8685dd483200a109 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 5 Aug 2024 22:28:13 +0200
Subject: [PATCH 02/37] Handle failure to send data via SerialMuxProt

---
 lib/APPReinforcementLearning/src/App.cpp | 61 +++++++++++++-----------
 lib/APPReinforcementLearning/src/App.h   |  9 ++--
 2 files changed, 38 insertions(+), 32 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp
index 2f469d32..623ff636 100644
--- a/lib/APPReinforcementLearning/src/App.cpp
+++ b/lib/APPReinforcementLearning/src/App.cpp
@@ -42,6 +42,7 @@
 #include <Odometry.h>
 #include "ReadyState.h"
 #include <Logging.h>
+#include "TrainingState.h"
 
 /******************************************************************************
  * Compiler Switches
@@ -123,8 +124,7 @@ void App::loop()
             payload = {SMPChannelPayload::Status::DONE};
         }
 
-        /* Ignoring return value, as error handling is not available. */
-        (void)m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload));
+        m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload));
 
         m_statusTimer.restart();
     }
@@ -133,7 +133,23 @@ void App::loop()
     if (true == m_sendLineSensorsDataInterval.isTimeout() &&
         (&DrivingState::getInstance() == m_systemStateMachine.getState()))
     {
-        sendLineSensorsData();
+        ILineSensors&   lineSensors      = Board::getInstance().getLineSensors();
+        uint8_t         maxLineSensors   = lineSensors.getNumLineSensors();
+        const uint16_t* lineSensorValues = lineSensors.getSensorValues();
+        uint8_t         lineSensorIdx    = 0U;
+        LineSensorData  payload;
+
+        if (LINE_SENSOR_CHANNEL_DLC == (maxLineSensors * sizeof(uint16_t)))
+        {
+            while (maxLineSensors > lineSensorIdx)
+            {
+                payload.lineSensorData[lineSensorIdx] = lineSensorValues[lineSensorIdx];
+
+                ++lineSensorIdx;
+            }
+        }
+
+        m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload));
 
         m_sendLineSensorsDataInterval.restart();
     }
@@ -148,13 +164,19 @@ void App::loop()
             SMPChannelPayload::Mode payload =
                 (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE;
 
-            /* Ignoring return value, as error handling is not available. */
-            (void)m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload));
+            m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload));
 
             m_modeSelectionSent = true;
         }
     }
 
+    if (false == m_data_sent)
+    {
+        /* Failed to send data to the supervisor. Go to error state. */
+        ErrorState::getInstance().setErrorMsg("DSF");
+        m_systemStateMachine.setState(&ErrorState::getInstance());
+    }
+
     m_smpServer.process(millis());
 
     m_systemStateMachine.process();
@@ -174,6 +196,13 @@ void App::handleRemoteCommand(const Command& cmd)
         m_modeSelectionSent = false;
         break;
 
+    case SMPChannelPayload::CmdId::CMD_ID_SET_TRAINING_STATE:
+        Odometry::getInstance().clearPosition();
+        Odometry::getInstance().clearMileage();
+
+        m_systemStateMachine.setState(&TrainingState::getInstance());
+        break;
+
     default:
         /* Nothing to do. */
         break;
@@ -188,28 +217,6 @@ void App::handleRemoteCommand(const Command& cmd)
  * Private Methods
  *****************************************************************************/
 
-void App::sendLineSensorsData() const
-{
-    ILineSensors&   lineSensors      = Board::getInstance().getLineSensors();
-    uint8_t         maxLineSensors   = lineSensors.getNumLineSensors();
-    const uint16_t* lineSensorValues = lineSensors.getSensorValues();
-    uint8_t         lineSensorIdx    = 0U;
-    LineSensorData  payload;
-
-    if (LINE_SENSOR_CHANNEL_DLC == (maxLineSensors * sizeof(uint16_t)))
-    {
-        while (maxLineSensors > lineSensorIdx)
-        {
-            payload.lineSensorData[lineSensorIdx] = lineSensorValues[lineSensorIdx];
-
-            ++lineSensorIdx;
-        }
-    }
-
-    /* Ignoring return value, as error handling is not available. */
-    (void)m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload));
-}
-
 bool App::setupSerialMuxProt()
 {
     bool isSuccessful = false;
diff --git a/lib/APPReinforcementLearning/src/App.h b/lib/APPReinforcementLearning/src/App.h
index 8615bfa0..4d73fc4a 100644
--- a/lib/APPReinforcementLearning/src/App.h
+++ b/lib/APPReinforcementLearning/src/App.h
@@ -73,6 +73,7 @@ class App
         m_statusTimer(),
         m_sendLineSensorsDataInterval(),
         m_modeSelectionSent(false),
+        m_data_sent(true),
         m_smpServer(Serial, this)
     {
     }
@@ -141,6 +142,9 @@ class App
     /** Ensue that the mode is only sent once */
     bool m_modeSelectionSent;
 
+    /** Flag that indicates whether the last data transmission succeeded. */
+    bool m_data_sent;
+
     /**
      * Setup the SerialMuxProt channels.
      *
@@ -148,11 +152,6 @@ class App
      */
     bool setupSerialMuxProt();
 
-    /**
-     * Send line sensors data via SerialMuxProt.
-     */
-    void sendLineSensorsData() const;
-
     /**
      * Copy construction of an instance.
      * Not allowed.

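The firmware above escalates a failed sendData() call to the error state; the supervisor side of this series (patch 05) mirrors that by queueing failed payloads in an unsent_data list and retrying them later. A small Python sketch of that pattern, assuming send_data() returns a boolean as it does elsewhere in this series:

    def send_or_queue(smp_server, unsent_data, channel, payload):
        """Send a payload; on failure remember it for a later retry."""
        if smp_server.send_data(channel, payload) is False:
            unsent_data.append((channel, payload))

    def retry_unsent(smp_server, unsent_data) -> bool:
        """Retry queued payloads; returns True if everything was sent."""
        for data in unsent_data[:]:
            if smp_server.send_data(data[0], data[1]) is True:
                unsent_data.remove(data)
        return len(unsent_data) == 0
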
From e79d46fffc255202acf833dbd9fb153f7969ede0 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 5 Aug 2024 22:36:29 +0200
Subject: [PATCH 03/37] Delete unnecessary functions and add a timer to handle
 state changes

---
 .../src/ReadyState.cpp                        | 57 ++-----------------
 lib/APPReinforcementLearning/src/ReadyState.h | 21 +------
 2 files changed, 7 insertions(+), 71 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/ReadyState.cpp b/lib/APPReinforcementLearning/src/ReadyState.cpp
index 5e17cd6e..ec720077 100644
--- a/lib/APPReinforcementLearning/src/ReadyState.cpp
+++ b/lib/APPReinforcementLearning/src/ReadyState.cpp
@@ -59,13 +59,6 @@
  * Local Variables
  *****************************************************************************/
 
-const uint16_t ReadyState::SENSOR_VALUE_MAX = Board::getInstance().getLineSensors().getSensorValueMax();
-
-/* Initialize the required sensor IDs to be generic. */
-const uint8_t ReadyState::SENSOR_ID_MOST_LEFT  = 0U;
-const uint8_t ReadyState::SENSOR_ID_MIDDLE     = Board::getInstance().getLineSensors().getNumLineSensors() / 2U;
-const uint8_t ReadyState::SENSOR_ID_MOST_RIGHT = Board::getInstance().getLineSensors().getNumLineSensors() - 1U;
-
 /******************************************************************************
  * Public Methods
  *****************************************************************************/
@@ -87,6 +80,7 @@ void ReadyState::entry()
         display.print(m_lapTime);
         display.print("ms");
     }
+    m_stateTransitionTimer.start(m_state_Transition_period);
     m_modeTimeoutTimer.start(mode_selected_period);
     m_mode                        = IDLE;
     m_isLastStartStopLineDetected = false;
@@ -99,11 +93,6 @@ void ReadyState::process(StateMachine& sm)
     IButton& buttonA = Board::getInstance().getButtonA();
     IButton& buttonB = Board::getInstance().getButtonB();
 
-    /* Get each sensor value. */
-    ILineSensors&   lineSensors      = Board::getInstance().getLineSensors();
-    uint8_t         maxLineSensors   = lineSensors.getNumLineSensors();
-    const uint16_t* lineSensorValues = lineSensors.getSensorValues();
-
     /* Shall the driving mode be released? */
     if (true == Util::isButtonTriggered(buttonA, m_isButtonAPressed))
     {
@@ -132,17 +121,11 @@ void ReadyState::process(StateMachine& sm)
         ;
     }
 
-    /**Drive forward until START LINE is crossed */
-    driveUntilStartLineisCrossed();
-
-    if (false == (isStartStopLineDetected(lineSensorValues, maxLineSensors)) && (true == m_isLastStartStopLineDetected))
+    /* Transition to the driving state after the state transition timer expired. */
+    if (true == m_stateTransitionTimer.isTimeout())
     {
         sm.setState(&DrivingState::getInstance());
     }
-    else
-    {
-        m_isLastStartStopLineDetected = isStartStopLineDetected(lineSensorValues, maxLineSensors);
-    }
 }
 
 void ReadyState::exit()
@@ -174,45 +157,13 @@ ReadyState::ReadyState() :
     m_isButtonAPressed(false),
     m_isButtonBPressed(false),
     m_modeTimeoutTimer(),
+    m_stateTransitionTimer(),
     m_lapTime(0),
     m_isLastStartStopLineDetected(false),
     m_mode(IDLE)
 {
 }
 
-/**Drive forward until START LINE is crossed */
-void ReadyState::driveUntilStartLineisCrossed()
-{
-    DifferentialDrive& diffDrive  = DifferentialDrive::getInstance();
-    int16_t            top_speed  = 2000;           /* Set a top speed of 2000 */
-    int16_t            leftMotor  = top_speed / 2U; /* Drive at half speed */
-    int16_t            rightMotor = top_speed / 2U; /* Drive at half speed */
-    diffDrive.setLinearSpeed(leftMotor, rightMotor);
-}
-
-bool ReadyState::isStartStopLineDetected(const uint16_t* lineSensorValues, uint8_t length) const
-{
-    bool           isDetected  = false;
-    const uint32_t LINE_MAX_30 = (SENSOR_VALUE_MAX * 3U) / 10U; /* 30 % of max. value */
-    const uint32_t LINE_MAX_70 = (SENSOR_VALUE_MAX * 7U) / 10U; /* 70 % of max. value */
-
-    /*
-     * ===     =     ===
-     *   +   + + +   +
-     *   L     M     R
-     */
-    if ((LINE_MAX_30 <= lineSensorValues[SENSOR_ID_MOST_LEFT]) &&
-        (LINE_MAX_70 > lineSensorValues[SENSOR_ID_MIDDLE - 1U]) &&
-        (LINE_MAX_70 <= lineSensorValues[SENSOR_ID_MIDDLE]) &&
-        (LINE_MAX_70 > lineSensorValues[SENSOR_ID_MIDDLE + 1U]) &&
-        (LINE_MAX_30 <= lineSensorValues[SENSOR_ID_MOST_RIGHT]))
-    {
-        isDetected = true;
-    }
-
-    return isDetected;
-}
-
 /******************************************************************************
  * External Functions
  *****************************************************************************/
diff --git a/lib/APPReinforcementLearning/src/ReadyState.h b/lib/APPReinforcementLearning/src/ReadyState.h
index 772003cd..76c59ebf 100644
--- a/lib/APPReinforcementLearning/src/ReadyState.h
+++ b/lib/APPReinforcementLearning/src/ReadyState.h
@@ -117,8 +117,9 @@ class ReadyState : public IState
     };
 
     /** Duration of the selected mode in ms. This is the maximum time to select a mode. */
-    static const uint32_t mode_selected_period = 200U;
+    static const uint32_t mode_selected_period = 100;
 
+    static const uint32_t m_state_Transition_period = 150;
     /**
      * The line sensor threshold (normalized) used to detect the track.
      * The track is detected in case the received value is greater or equal than
@@ -153,7 +154,7 @@ class ReadyState : public IState
     uint32_t    m_lapTime;                     /**< Lap time in ms of the last successful driven round. */
     bool        m_isButtonAPressed;            /**< Is the button A pressed (last time)? */
     bool        m_isButtonBPressed;            /**< Is the button B pressed (last time)? */
-
+    SimpleTimer m_stateTransitionTimer;        /**< Timer used to handle state transitions. */
     /**
      * Default constructor.
      */
@@ -183,22 +184,6 @@ class ReadyState : public IState
      * @returns Reference to ErrorState instance.
      */
     ReadyState& operator=(const ReadyState& state);
-
-    /*
-     * The robot moves from its current position
-     * until it crosses and leaves the start line.
-     */
-    void driveUntilStartLineisCrossed();
-
-    /**
-     * Is the start/stop line detected?
-     *
-     * @param[in] lineSensorValues  The line sensor values as array.
-     * @param[in] length            The number of line sensor values.
-     *
-     * @return If start/stop line detected, it will return true otherwise false.
-     */
-    bool isStartStopLineDetected(const uint16_t* lineSensorValues, uint8_t length) const;
 };
 
 /******************************************************************************

From cd3542ba1c6eddaa848124e4f564de5d725ac5f5 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 5 Aug 2024 22:40:52 +0200
Subject: [PATCH 04/37] Add constraint to setTargetSpeeds() to limit motor
 speeds between 0 and the maximum motor speed.

---
 lib/APPReinforcementLearning/src/DrivingState.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/DrivingState.cpp b/lib/APPReinforcementLearning/src/DrivingState.cpp
index 5ef59c6c..c95bb6aa 100644
--- a/lib/APPReinforcementLearning/src/DrivingState.cpp
+++ b/lib/APPReinforcementLearning/src/DrivingState.cpp
@@ -79,9 +79,15 @@ void DrivingState::exit()
     m_observationTimer.stop();
 }
 
-void DrivingState::setTargetSpeeds(int16_t leftMotor, int16_t rightMotor)
+void DrivingState::setTargetSpeeds(int16_t leftSpeed, int16_t rightSpeed)
 {
-    DifferentialDrive::getInstance().setLinearSpeed(leftMotor, rightMotor);
+    DifferentialDrive& diffDrive       = DifferentialDrive::getInstance();
+    const int16_t      MAX_MOTOR_SPEED = diffDrive.getMaxMotorSpeed();
+    const int16_t      MIN_MOTOR_SPEED = 0;
+    leftSpeed                          = constrain(leftSpeed, MIN_MOTOR_SPEED, MAX_MOTOR_SPEED);
+    rightSpeed                         = constrain(rightSpeed, MIN_MOTOR_SPEED, MAX_MOTOR_SPEED);
+
+    diffDrive.setLinearSpeed(leftSpeed, rightSpeed);
 }
 
 bool DrivingState::isAbortRequired()

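constrain() used above is presumably the Arduino-style clamp macro provided by the target framework; the patch limits both target speeds to the range [0, getMaxMotorSpeed()]. The same clamping logic, sketched in Python for illustration:

    def clamp_speed(speed: int, max_motor_speed: int) -> int:
        """Clamp a target speed to the range [0, max_motor_speed]."""
        return max(0, min(speed, max_motor_speed))

    # Example with an assumed maximum motor speed of 2000:
    assert clamp_speed(-500, 2000) == 0
    assert clamp_speed(1200, 2000) == 1200
    assert clamp_speed(3000, 2000) == 2000
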
From e79697f4062ef63a8767e91bc72f310153097e93 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 5 Aug 2024 22:45:02 +0200
Subject: [PATCH 05/37] Add Agent, TrajectoryBuffer, and Networks classes, and
 integrate them into the rl_supervisor.

---
 webots/controllers/RL_Supervisor/agent.py     | 501 ++++++++++++++++++
 webots/controllers/RL_Supervisor/networks.py  | 106 ++++
 .../RL_Supervisor/rl_supervisor.py            | 162 +++---
 .../RL_Supervisor/trajectory_buffer.py        | 143 +++++
 4 files changed, 849 insertions(+), 63 deletions(-)
 create mode 100644 webots/controllers/RL_Supervisor/agent.py
 create mode 100644 webots/controllers/RL_Supervisor/networks.py
 create mode 100644 webots/controllers/RL_Supervisor/trajectory_buffer.py

diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py
new file mode 100644
index 00000000..a6422069
--- /dev/null
+++ b/webots/controllers/RL_Supervisor/agent.py
@@ -0,0 +1,501 @@
+""" Implementation ofintelligent agent."""
+
+# MIT License
+#
+# Copyright (c) 2023 - 2024 Andreas Merkle <web@blue-andi.de>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
+
+################################################################################
+# Imports
+################################################################################
+
+import struct
+import numpy as np
+import tensorflow as tf
+import tensorflow_probability as tfp
+from trajectory_buffer import Memory
+from networks import Networks
+
+################################################################################
+# Variables
+################################################################################
+
+# Constants
+COMMAND_CHANNEL_NAME = "CMD"
+CMD_DLC = 1
+
+SPEED_SET_CHANNEL_NAME = "SPEED_SET"
+SPEED_SET_DLC = 4
+
+LINE_SENSOR_CHANNEL_NAME = "LINE_SENS"
+LINE_SENSOR_ON_TRACK_MIN_VALUE = 200
+
+STATUS_CHANNEL_NAME = "STATUS"
+STATUS_CHANNEL_ERROR_VAL = 1
+
+MODE_CHANNEL_NAME = "MODE"
+CMD_ID_SET_READY_STATE = 1
+CMD_ID_SET_TRAINING_STATE = 2
+POSITION_DATA = [-0.24713614078815466, 0.01, 0.013994298332013683]
+ORIENTATION_DATA = [
+    -1.0564747468923541e-06,
+    8.746699709178704e-07,
+    0.9999999999990595,
+    1.5880805820884731,
+]
+MAX_SENSOR_VALUE = 1000
+MIN_STD_DEV = 0.1
+STD_DEV_FACTOR = 0.3995
+
+################################################################################
+# Classes
+################################################################################
+
+
+class Agent:  # pylint: disable=too-many-instance-attributes
+    """The Agent class represents an intelligent agent that makes decisions to
+    control motors based on the position of the robot."""
+
+    # pylint: disable=too-many-arguments
+    def __init__(
+        self,
+        smp_server,
+        gamma=0.99,
+        alpha=0.0003,
+        gae_lambda=0.95,
+        policy_clip=0.2,
+        batch_size=64,
+        chkpt_dir="models/",
+        top_speed=2000,
+        max_buffer_length=600,
+    ):
+        self.__serialmux = smp_server
+        self.__gamma = gamma
+        self.__alpha = alpha
+        self.__gae_lambda = gae_lambda
+        self.__policy_clip = policy_clip
+        self.__chkpt_dir = chkpt_dir
+        self.train_mode = False
+        self.__top_speed = top_speed
+        self.__memory = Memory(batch_size, max_buffer_length)
+        self.__neural_network = Networks(self.__alpha)
+        self.__training_index = 0  # Track batch index during training
+        self.__current_batch = None  # Saving of the current batch which is in process
+        self.__std_dev = 1
+        self.done = False
+        self.action = None
+        self.value = None
+        self.adjusted_log_prob = None
+        self.num_episodes = 0
+        self.state = "IDLE"
+        self.data_sent = True
+        self.unsent_data = []
+        self.__last_estimated_pos = 0
+
+    def set_train_mode(self):
+        """Set the Agent mode to train mode."""
+        self.train_mode = True
+        self.state = "READY"
+
+    def set_drive_mode(self):
+        """Set the Agent mode to drive mode."""
+        self.train_mode = False
+        self.state = "READY"
+        self.num_episodes = 0
+
+    def store_transition(
+        self, state, action, probs, vals, reward, done
+    ):  # pylint: disable=too-many-arguments
+        """Store transitions in the replay buffer."""
+
+        self.__memory.store_memory(state, action, probs, vals, reward, done)
+
+    def save_models(self):
+        """Saves the models in the specified file."""
+
+        self.__neural_network.actor_network.save(self.__chkpt_dir + "actor.keras")
+        self.__neural_network.critic_network.save(self.__chkpt_dir + "critic.keras")
+
+    def load_models(self):
+        """Loads the models in the specified file."""
+
+        self.__neural_network.actor_network = tf.keras.models.load_model(
+            self.__chkpt_dir + "actor.keras", compile=False
+        )
+        self.__neural_network.critic_network = tf.keras.models.load_model(
+            self.__chkpt_dir + "critic.keras", compile=False
+        )
+
+    def predict_action(self, state):
+        """Predicts an action based on the current state."""
+
+        # Conversion of the state into a tensor
+        m_state = self.normalize_sensor_data(state)
+        state = tf.convert_to_tensor([m_state], dtype=tf.float32)
+
+        # Calculation of probabilities by the Actor neural network
+        probs = self.__neural_network.actor_network(state)
+
+        # Create a normal distribution with the calculated probabilities and the standard deviation
+        dist = tfp.distributions.Normal(probs, self.__std_dev)
+
+        # Sampling an action from the normal distribution
+        sampled_action = dist.sample()
+
+        # Apply the Tanh transformation to the sampled action
+        transformed_action = tf.tanh(sampled_action)
+
+        # Calculation of the logarithm of the probability density of the sampled action
+        log_prob = dist.log_prob(sampled_action)
+
+        # Calculation of the Jacobian determinant for the Tanh transformation
+        jacobian_log_det = tf.math.log(1 - tf.square(transformed_action) + 1e-6)
+
+        # Adjust the log probability to account for the Tanh transformation
+        adjusted_log_prob = log_prob - jacobian_log_det
+
+        # calculate the estimated value of a state, which is determined by the Critic network
+        value = self.__neural_network.critic_network(state)
+
+        self.action = transformed_action.numpy()[0]
+        self.value = value.numpy()[0]
+        self.adjusted_log_prob = adjusted_log_prob.numpy()[0]
+
+        return self.action
+
+    def send_motor_speeds(self, state):
+        """Sends the motor speeds to the robot."""
+
+        pre_action = self.predict_action(state)
+
+        # Get motor speed difference
+        speed_difference = self.__top_speed * pre_action
+
+        # Get individual motor speeds. The sign of speed_difference
+        # determines if the robot turns left or right.
+        left_motor_speed = int(self.__top_speed - speed_difference)
+        right_motor_speed = int(self.__top_speed + speed_difference)
+
+        control_data = struct.pack("2H", left_motor_speed, right_motor_speed)
+        self.data_sent = self.__serialmux.send_data("SPEED_SET", control_data)
+
+        # If sending failed, append the data to the unsent_data list.
+        if self.data_sent is False:
+            self.unsent_data.append(("SPEED_SET", control_data))
+
+    def update(self, robot_node):
+        """Checks if the sequence has ended, performs updates, and saves the models."""
+
+        # Checks whether the sequence has ended if it is set to Training mode.
+        if (self.train_mode is True) and (
+            (self.done is True) or (self.__memory.is_memory_full() is True)
+        ):
+            cmd_payload = struct.pack("B", CMD_ID_SET_TRAINING_STATE)
+            self.data_sent = self.__serialmux.send_data("CMD", cmd_payload)
+
+            # If sending failed, append the data to the unsent_data list.
+            if self.data_sent is False:
+                self.unsent_data.append(("CMD", cmd_payload))
+
+            self.reinitialize(robot_node)
+            self.state = "TRAINING"
+
+            # save models
+            if self.__memory.get_sum_rewards() >= 1000.0:
+                self.save_models()
+
+        # Checks whether the sequence has ended if it is set to driving mode.
+        if (self.train_mode is False) and (self.done is True):
+            self.done = False
+            motorcontrol = struct.pack("2H", 0, 0)
+            cmd_payload = struct.pack("B", CMD_ID_SET_READY_STATE)
+
+            self.data_sent = self.__serialmux.send_data(
+                "SPEED_SET", motorcontrol
+            )  # stop the motors immediately
+            # If sending failed, append the data to the unsent_data list.
+            if self.data_sent is False:
+                self.unsent_data.append(("SPEED_SET", motorcontrol))
+
+            self.data_sent = self.__serialmux.send_data("CMD", cmd_payload)
+
+            # If sending failed, append the data to the unsent_data list.
+            if self.data_sent is False:
+                self.unsent_data.append(("CMD", cmd_payload))
+
+    def normalize_sensor_data(self, sensor_data):
+        """The normalize_sensor_data function scales the sensor data to a range between 0 and 1."""
+
+        # Normalized sensor data
+        sensor_data = np.array(sensor_data) / MAX_SENSOR_VALUE
+
+        return sensor_data
+
+    def calculate_reward(self, sensor_data):
+        """
+        The calculate_reward function evaluates the consequences of a certain
+        action performed in a certain state by calculating the resulting reward
+        """
+        estimated_pos = self.calculate_position(sensor_data)
+
+        # Return reward between 0 and 10
+        if 500 <= estimated_pos <= 2000:
+            reward = ((1 / 150) * estimated_pos) - (10 / 3)
+            return reward
+
+        if 2000 < estimated_pos <= 3500:
+            reward = ((-1 / 150) * estimated_pos) + (70 / 3)
+            return reward
+
+        return 0
+
+    def calculate_position(self, sensor_data):
+        """
+        Determines the deviation and returns an estimated position of the robot
+        with respect to a line. The estimate is made using a weighted average of
+        the sensor indices multiplied by 1000, so that a return value of 0
+        indicates that the line is directly below sensor 0, a return value of
+        1000 indicates that the line is directly below sensor 1, 2000
+        indicates that it is directly below sensor 2, etc. Intermediate values
+        indicate that the line is between two sensors. The formula is:
+
+                0*value0 + 1000*value1 + 2000*value2 + ...
+            --------------------------------------------
+                value0  +  value1  +  value2 + ...
+
+        This function assumes a dark line (high values) surrounded by white
+        (low values).
+
+        Parameters
+        ----------
+        sensor_data : List
+
+        Returns
+        ----------
+        Estimated position with respect to track.
+        """
+
+        estimated_pos = 0
+        numerator = 0
+        denominator = 0
+        weight = 1000
+        is_on_line = False
+
+        max_sensors = len(sensor_data)
+        sensor_max_value = 1000
+
+        for idx, sensor_value in enumerate(sensor_data):
+            numerator += idx * weight * sensor_value
+            denominator += sensor_value
+
+            # Keep track of whether we see the line at all.
+            if LINE_SENSOR_ON_TRACK_MIN_VALUE < sensor_value:
+                is_on_line = True
+
+        if is_on_line is False:
+
+            # If the line was last seen to the left of center, return 0.
+            if self.__last_estimated_pos < ((max_sensors - 1) * sensor_max_value) / 2:
+                estimated_pos = 0
+
+            # If the line was last seen to the right of center, return the max. value.
+            else:
+                estimated_pos = (max_sensors - 1) * sensor_max_value
+
+        else:
+            # Check to avoid division by zero.
+            if denominator != 0:
+                estimated_pos = numerator / denominator
+
+            self.__last_estimated_pos = estimated_pos
+
+        return estimated_pos
+
+    def calculate_advantages(self, rewards, values, dones):
+        """Calculate advantages for each state in a mini-batch."""
+
+        mini_batch_size = len(rewards)
+
+        # Create empty advantages array
+        advantages = np.zeros(mini_batch_size, dtype=np.float32)
+
+        for start_index in range(mini_batch_size):
+            discount = 1
+            advantage = 0
+
+            for future_index in range(start_index, mini_batch_size - 1):
+
+                # Calculate the temporal difference (TD)
+                delta = (
+                    rewards[future_index]
+                    + (
+                        self.__gamma
+                        * values[future_index + 1]
+                        * (1 - int(dones[future_index]))
+                    )
+                    - values[future_index]
+                )
+
+                # Accumulate the advantage using the discount factor
+                advantage += discount * delta
+
+                # Update the discount factor for the next step
+                discount *= self.__gamma * self.__gae_lambda
+
+                # Stop if a terminal state is reached
+                if dones[future_index]:
+                    break
+
+            # Save the calculated advantage for the current state
+            advantages[start_index] = advantage
+
+        return advantages
+
+    def learn(
+        self, states, actions, old_probs, values, rewards, dones
+    ):  # pylint: disable=too-many-arguments
+        # pylint: disable=too-many-locals
+        """Perform training to optimize model weights"""
+
+        advantages = self.calculate_advantages(rewards, values, dones)
+
+        # optimize Actor Network weights
+        with tf.GradientTape() as tape:
+            states = tf.convert_to_tensor(states)
+            actions = tf.convert_to_tensor(actions)
+            old_probs = tf.convert_to_tensor(old_probs)
+
+            # Predict the action means for the saved states
+            predict = self.__neural_network.actor_network(states)
+            new_dist = tfp.distributions.Normal(predict, 0.1)
+            new_log_prob = new_dist.log_prob(actions)
+            transformed_action = tf.tanh(actions)
+            jacobian_log_det = tf.math.log(1 - tf.square(transformed_action) + 1e-6)
+            adjusted_new_log_prob = new_log_prob - jacobian_log_det
+
+            # The ratio between the new model and the old model’s action log probabilities
+            prob_ratio = tf.exp(adjusted_new_log_prob - old_probs)
+
+            # If the ratio is too large or too small, it will be
+            # clipped according to the surrogate function.
+            weighted_probs = prob_ratio * advantages
+            clipped_probs = tf.clip_by_value(
+                prob_ratio, 1 - self.__policy_clip, 1 + self.__policy_clip
+            )
+            weighted_clipped_probs = clipped_probs * advantages
+
+            # Policy Gradient Loss
+            actor_loss = -tf.reduce_mean(
+                tf.minimum(weighted_probs, weighted_clipped_probs)
+            )
+
+        # calculate gradient
+        actor_params = self.__neural_network.actor_network.trainable_variables
+        actor_grads = tape.gradient(actor_loss, actor_params)
+        self.__neural_network.actor_optimizer.apply_gradients(
+            zip(actor_grads, actor_params)
+        )
+
+        # optimize Critic Network weights
+        with tf.GradientTape() as tape:
+
+            critic_value = self.__neural_network.critic_network(states)
+            critic_value = tf.squeeze(critic_value, 1)
+            returns = advantages + values
+
+            # Generate loss
+            critic_loss = tf.keras.losses.MSE(critic_value, returns)
+
+        # calculate gradient
+        critic_params = self.__neural_network.critic_network.trainable_variables
+        critic_grads = tape.gradient(critic_loss, critic_params)
+        self.__neural_network.critic_optimizer.apply_gradients(
+            zip(critic_grads, critic_params)
+        )
+
+    def perform_training(self):
+        """Runs the training process."""
+
+        if self.__current_batch is None:
+
+            # Grab sample from memory
+            self.__current_batch = self.__memory.generate_batches()
+
+        # Perform training with mini-batches.
+        if self.__training_index < len(self.__current_batch[-1]):
+            (
+                state_arr,
+                action_arr,
+                old_prob_arr,
+                vals_arr,
+                reward_arr,
+                dones_arr,
+                batches,
+            ) = self.__current_batch
+            batch = batches[self.__training_index]
+
+            self.learn(
+                state_arr[batch],
+                action_arr[batch],
+                old_prob_arr[batch],
+                vals_arr[batch],
+                reward_arr[batch],
+                dones_arr[batch],
+            )
+            self.__training_index += 1
+
+        # Training completed.
+        else:
+            self.__training_index = 0
+            self.__current_batch = None
+            self.done = False
+            self.__memory.clear_memory()
+            self.state = "IDLE"
+            self.num_episodes += 1
+            cmd_payload = struct.pack("B", CMD_ID_SET_READY_STATE)
+            self.data_sent = self.__serialmux.send_data("CMD", cmd_payload)
+
+            # If sending failed, append the data to the unsent_data list.
+            if self.data_sent is False:
+                self.unsent_data.append(("CMD", cmd_payload))
+
+            # Minimize standard deviation until the minimum standard deviation is reached
+            self.__std_dev = self.__std_dev * STD_DEV_FACTOR
+            self.__std_dev = max(self.__std_dev, MIN_STD_DEV)
+
+    def reinitialize(self, robot_node):
+        """Re-initialization of position and orientation of The ROBOT."""
+        trans_field = robot_node.getField("translation")
+        rot_field = robot_node.getField("rotation")
+        initial_position = POSITION_DATA
+        initial_orientation = ORIENTATION_DATA
+        trans_field.setSFVec3f(initial_position)
+        rot_field.setSFRotation(initial_orientation)
+
+
+################################################################################
+# Functions
+################################################################################
+
+################################################################################
+# Main
+################################################################################
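
A short worked example of the position estimate and reward shaping implemented in agent.py above; the sensor values are illustrative only:

    # The line is mostly under the centre sensor (index 2) of the five sensors.
    sensor_data = [0, 200, 800, 200, 0]

    # Weighted average used by calculate_position():
    # (0*0 + 1000*200 + 2000*800 + 3000*200 + 4000*0) / (200 + 800 + 200) = 2000
    position = sum(idx * 1000 * value for idx, value in enumerate(sensor_data)) / sum(sensor_data)
    assert position == 2000

    # calculate_reward() maps positions 500..2000 linearly onto rewards 0..10 and
    # 2000..3500 back onto 10..0, so a centred robot receives the maximum reward.
    reward = (1 / 150) * position - (10 / 3)
    assert abs(reward - 10.0) < 1e-9
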
diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py
new file mode 100644
index 00000000..0328d60b
--- /dev/null
+++ b/webots/controllers/RL_Supervisor/networks.py
@@ -0,0 +1,106 @@
+"""Implementing Actor and Critic Networks"""
+
+# MIT License
+#
+# Copyright (c) 2023 - 2024 Andreas Merkle <web@blue-andi.de>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
+
+################################################################################
+# Imports
+################################################################################
+
+import tensorflow as tf
+from tensorflow.keras import layers  # pylint: disable=import-error
+
+################################################################################
+# Variables
+################################################################################
+
+
+################################################################################
+# Classes
+################################################################################
+
+
+class Networks:
+    """Class for building networks of actors and critics"""
+
+    def __init__(self, alpha):
+        self.__learning_rate = alpha
+        self.actor_network = self.build_actor_network()
+        self.critic_network = self.build_critic_network()
+        self.actor_optimizer = tf.keras.optimizers.Adam(self.__learning_rate)
+        self.critic_optimizer = tf.keras.optimizers.Adam(self.__learning_rate)
+
+    def build_actor_network(self):
+        """Build Actor Network."""
+
+        state_input = layers.Input(shape=(5,))  # Assuming 5 sensor inputs
+        fc1 = layers.Dense(
+            64,
+            activation="relu",
+            kernel_initializer="he_normal",
+            bias_initializer="zeros",
+        )(state_input)
+        fc2 = layers.Dense(
+            64,
+            activation="relu",
+            kernel_initializer="he_normal",
+            bias_initializer="zeros",
+        )(fc1)
+        mu = layers.Dense(
+            1,
+            activation="tanh",
+            kernel_initializer="glorot_uniform",
+            bias_initializer="zeros",
+        )(fc2)
+
+        return tf.keras.models.Model(inputs=state_input, outputs=mu)
+
+    def build_critic_network(self):
+        """Build Critic Network"""
+
+        state_input = layers.Input(shape=(5,))  # Assuming 5 sensor inputs
+        fc1 = layers.Dense(
+            64,
+            activation="relu",
+            kernel_initializer="he_normal",
+            bias_initializer="zeros",
+        )(state_input)
+        fc2 = layers.Dense(
+            64,
+            activation="relu",
+            kernel_initializer="he_normal",
+            bias_initializer="zeros",
+        )(fc1)
+        value = layers.Dense(1)(fc2)  # Value output
+
+        return tf.keras.models.Model(inputs=state_input, outputs=value)
+
+
+################################################################################
+# Functions
+################################################################################
+
+################################################################################
+# Main
+################################################################################
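
For reference, a minimal usage sketch of the two networks, assuming networks.py is importable and TensorFlow is installed: the actor outputs the mean of the action distribution squashed to [-1, 1] by the tanh activation, while the critic outputs an unbounded scalar state-value estimate.

    import numpy as np
    from networks import Networks

    networks = Networks(alpha=0.0003)
    dummy_state = np.zeros((1, 5), dtype=np.float32)    # five normalized sensor readings
    action_mean = networks.actor_network(dummy_state)   # shape (1, 1), values in [-1, 1]
    state_value = networks.critic_network(dummy_state)  # shape (1, 1), state-value estimate
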
diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py
index 309e9bd0..90880fe9 100644
--- a/webots/controllers/RL_Supervisor/rl_supervisor.py
+++ b/webots/controllers/RL_Supervisor/rl_supervisor.py
@@ -34,9 +34,11 @@
 
 import sys
 import struct
+import os
 from controller import Supervisor  # pylint: disable=import-error
 from serial_webots import SerialWebots
 from SerialMuxProt import Server
+from agent import Agent
 
 ################################################################################
 # Variables
@@ -62,14 +64,12 @@
 
 MODE_CHANNEL_NAME = "MODE"
 CMD_ID_SET_READY_STATE = 1
-POSITION_DATA = [-0.24713614078815466, -0.04863962992854465, 0.013994298332013683]
-ORIENTATION_DATA = [
-    -1.0564747468923541e-06,
-    8.746699709178704e-07,
-    0.9999999999990595,
-    1.5880805820884731,
-]
+MIN_NUMBER_OF_STEPS = 200
+SENSOR_ID_MOST_LEFT = 0
+SENSOR_ID_MOST_RIGHT = 4
 
+# Path of saved models
+PATH = "models/"
 
 ################################################################################
 # Classes
@@ -79,51 +79,28 @@
 class RobotController:
     """Class for data flow control logic"""
 
-    def __init__(self, robot_node, smp_server, tick_size):
-        self.__robot_node = robot_node
+    def __init__(self, smp_server, tick_size, agent):
         self.__smp_server = smp_server
+        self.__agent = agent
         self.__tick_size = tick_size
         self.__no_line_detection_count = 0
         self.__timestamp = 0  # Elapsed time since reset [ms]
-        self.is_sequence_complete = False
-
-    def reinitialize(self):
-        """Re-initialization of position and orientation of The ROBOT."""
-        trans_field = self.__robot_node.getField("translation")
-        rot_field = self.__robot_node.getField("rotation")
-        initial_position = POSITION_DATA
-        initial_orientation = ORIENTATION_DATA
-        trans_field.setSFVec3f(initial_position)
-        rot_field.setSFRotation(initial_orientation)
-
-    def callback_status(self, payload: bytearray) -> int:
+        self.last_sensor_data = None
+        self.steps = 0
+
+    def callback_status(self, payload: bytearray) -> None:
         """Callback Status Channel."""
 
         # perform action on robot status feedback
         if payload[0] == STATUS_CHANNEL_ERROR_VAL:
             print("robot has reached error-state (max. lap time passed in robot)")
 
-            # stop robot motors
-            self.__smp_server.send_data(SPEED_SET_CHANNEL_NAME, struct.pack("2H", 0, 0))
-
-            # switch robot to ready state
-            self.__smp_server.send_data(
-                COMMAND_CHANNEL_NAME, struct.pack("B", CMD_ID_SET_READY_STATE)
-            )
-
-            # perform robot state reset
-            self.is_sequence_complete = True
+            self.__agent.done = True
 
     def callback_line_sensors(self, payload: bytearray) -> None:
         """Callback LINE_SENS Channel."""
         sensor_data = struct.unpack("5H", payload)
-
-        for idx in range(5):
-            if idx == 4:
-                print(f"Sensor[{idx}] = {sensor_data[idx]}")
-            else:
-                print(f"Sensor[{idx}] = {sensor_data[idx]}, ", end="")
-
+        self.steps += 1
         # determine lost line condition
         if all(value == 0 for value in sensor_data):
             self.__no_line_detection_count += 1
@@ -134,34 +111,67 @@ def callback_line_sensors(self, payload: bytearray) -> None:
         is_start_stop = all(
             value >= LINE_SENSOR_ON_TRACK_MIN_VALUE for value in sensor_data
         )
+        # Ignore the start/stop line if it is detected before the minimum number of steps.
+        if (is_start_stop is True) and (self.steps < MIN_NUMBER_OF_STEPS):
+            sensor_data = list(sensor_data)
+            sensor_data[SENSOR_ID_MOST_LEFT] = 0
+            sensor_data[SENSOR_ID_MOST_RIGHT] = 0
+
+        # Sequence stop criterion: debounced loss of line, or start/stop line detected after the minimum number of steps.
+        if self.__no_line_detection_count >= 30 or (
+            is_start_stop and (self.steps >= MIN_NUMBER_OF_STEPS)
+        ):
+            self.__agent.done = True
+            self.__no_line_detection_count = 0
 
-        # sequence stop criterion debounce no line detection and
-        if (self.__no_line_detection_count >= 20) or is_start_stop:
-            # Stop motors, maximum NO Line Detection Counter reached
-            self.__smp_server.send_data(SPEED_SET_CHANNEL_NAME, struct.pack("2H", 0, 0))
-
-            # SENDING A COMMAND ID SET READY STATUS
-            self.__smp_server.send_data(
-                COMMAND_CHANNEL_NAME, struct.pack("B", CMD_ID_SET_READY_STATE)
-            )
-            self.is_sequence_complete = True
-
-        else:
-            # [PLACEHOLDER] replace with line sensor data receiver
-
-            # Set left and right motors speed to 1000
-            self.__smp_server.send_data(
-                SPEED_SET_CHANNEL_NAME, struct.pack("2H", 1000, 1000)
-            )
+        # The sequence of states and actions is stored in memory for the training phase.
+        if self.__agent.train_mode:
+            reward = self.__agent.calculate_reward(sensor_data)
+
+            # Start storing the data once the second sensor reading has been received.
+            if self.last_sensor_data is not None:
+                normalized_sensor_data = self.__agent.normalize_sensor_data(sensor_data)
+                self.__agent.store_transition(
+                    normalized_sensor_data,
+                    self.__agent.action,
+                    self.__agent.adjusted_log_prob,
+                    self.__agent.value,
+                    reward,
+                    self.__agent.done,
+                )
+            self.last_sensor_data = sensor_data
+
+        # Sends the motor speeds to the robot.
+        if self.__agent.done is False and self.__agent.state == "READY":
+            self.__agent.send_motor_speeds(sensor_data)
 
     def callback_mode(self, payload: bytearray) -> None:
         """Callback MODE Channel."""
         driving_mode = payload[0]
 
         if driving_mode:
-            print("Driving Mode Selected.")
+            self.__agent.set_drive_mode()
+
         else:
-            print("Train Mode Selected.")
+            self.__agent.set_train_mode()
+
+    def load_models(self, path) -> None:
+        """Load Model if exist"""
+        if os.path.exists(path):
+            self.__agent.load_models()
+
+    def retry_unsent_data(self, unsent_data: list) -> bool:
+        """Resent any unsent Data"""
+        retry_succesful = True
+
+        # Resent the unsent Data.
+        for data in unsent_data[:]:
+            if self.__smp_server.send_data(data[0], data[1]) is True:
+                unsent_data.remove(data)
+            else:
+                retry_succesful = False
+
+        return retry_succesful
 
     def process(self):
         """function performing controller step"""
@@ -177,6 +187,7 @@ def process(self):
 
 
 # pylint: disable=duplicate-code
+# pylint: disable=too-many-branches
 def main_loop():
     """Main loop:
         - Perform simulation steps until Webots is stopping the controller.
@@ -229,9 +240,11 @@ def main_loop():
     if sermux_channel_cmd_id == 0:
         print("ERROR: channel CMD not created.")
         status = -1
+    # Create an instance of the intelligent agent.
+    agent = Agent(smp_server)
 
     # create instance of robot logic class
-    controller = RobotController(robot_node, smp_server, timestep)
+    controller = RobotController(smp_server, timestep, agent)
 
     smp_server.subscribe_to_channel(STATUS_CHANNEL_NAME, controller.callback_status)
 
@@ -243,14 +256,37 @@ def main_loop():
 
     # setup successful
     if status != -1:
+
+        controller.load_models(PATH)
         # simulation loop
         while supervisor.step(timestep) != -1:
             controller.process()
 
-            # stopping condition for sequence was reached
-            if controller.is_sequence_complete is True:
-                controller.reinitialize()
-                controller.is_sequence_complete = False
+            # Checks if the sequence has ended
+            if agent.state == "READY":
+                agent.update(robot_node)
+
+            # Start the training
+            elif agent.state == "TRAINING":
+                controller.steps = 0
+                agent.perform_training()
+
+            if 1000 <= agent.num_episodes:
+                print(f"Episodes: {agent.num_episodes}")
+
+            # Resend any unsent data.
+            if agent.unsent_data:
+
+                # Pause the simulation while the unsent data is handled.
+                supervisor.simulationSetMode(Supervisor.SIMULATION_MODE_PAUSE)
+
+                # Switch back to real-time mode once the unsent data has been resent.
+                if controller.retry_unsent_data(agent.unsent_data) is True:
+                    supervisor.simulationSetMode(Supervisor.SIMULATION_MODE_REAL_TIME)
+
+                # Otherwise reset the simulation.
+                else:
+                    supervisor.simulationReset()
 
     return status
 
diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py
new file mode 100644
index 00000000..84de8a14
--- /dev/null
+++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py
@@ -0,0 +1,143 @@
+""" Implementation of a Trajectory Buffer """
+
+# MIT License
+#
+# Copyright (c) 2023 - 2024 Andreas Merkle <web@blue-andi.de>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
+
+################################################################################
+# Imports
+################################################################################
+
+import numpy as np
+
+################################################################################
+# Variables
+################################################################################
+
+
+################################################################################
+# Classes
+################################################################################
+
+
+class Memory:  # pylint: disable=too-many-instance-attributes
+    """Class for store and manage experience tuples during reinforcement learning"""
+
+    def __init__(self, batch_size, max_length):
+        self.__states = []
+        self.__probs = []
+        self.__vals = []
+        self.__actions = []
+        self.__rewards = []
+        self.__dones = []
+        self.__max_length = max_length
+        self.__batch_size = batch_size
+
+    def generate_batches(self):
+        """
+        Generates batches of data for training.
+
+        Returns
+        ----------
+        Numpy-Array: States
+        Numpy-Array: Actions
+        Numpy-Array: Probs
+        Numpy-Array: Vals
+        Numpy-Array: Rewards
+        Numpy-Array: dones
+        List: Batches
+        """
+
+        # Determine the number of states
+        n_states = len(self.__states)
+
+        # Calculate start indices for each batch
+        batch_start = np.arange(0, n_states, self.__batch_size)
+
+        # Create indices for the states and mix them randomly
+        indices = np.arange(n_states, dtype=np.int64)
+        np.random.shuffle(indices)
+
+        # Create batches by dividing the indices into groups of the batch_size
+        batches = [indices[indx : indx + self.__batch_size] for indx in batch_start]
+        print("Number of Steps:", n_states)
+
+        return (
+            np.array(self.__states),
+            np.array(self.__actions),
+            np.array(self.__probs),
+            np.array(self.__vals),
+            np.array(self.__rewards),
+            np.array(self.__dones),
+            batches,
+        )
+
+    def get_sum_rewards(self) -> float:
+        """Calculate sum of reward"""
+        sum_rewards = sum(self.__rewards)
+
+        return sum_rewards
+
+    def store_memory(
+        self, state, action, probs, vals, reward, done
+    ):  # pylint: disable=too-many-arguments
+        """
+        Store transitions in the replay buffer.
+
+        Parameters
+        ----------
+            state: The state observed.
+            action: The action taken.
+            probs: The probability of taking the action.
+            vals: The estimated value of the state.
+            reward: The reward received.
+            done: Whether the episode is done.
+        """
+        self.__states.append(state)
+        self.__actions.append(action)
+        self.__probs.append(probs)
+        self.__vals.append(vals)
+        self.__rewards.append(reward)
+        self.__dones.append(done)
+
+    def clear_memory(self):
+        """Remove transitions from the replay buffer."""
+
+        self.__states = []
+        self.__probs = []
+        self.__vals = []
+        self.__actions = []
+        self.__rewards = []
+        self.__dones = []
+
+    def is_memory_full(self):
+        """
+        Checks whether the memory has reached its maximum capacity.
+
+        Returns
+        ----------
+        - Bool: Memory is full or not
+        """
+
+        is_full = len(self.__states) >= self.__max_length
+        return is_full

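Note on the buffer above: the Memory class is a plain Python trajectory buffer; transitions are appended once per simulation step and later regrouped into shuffled mini-batches. A minimal usage sketch, assuming only the class as defined in this patch (the state vector, reward and other values below are illustrative dummies):

    from trajectory_buffer import Memory

    memory = Memory(batch_size=64, max_length=600)

    # Store one transition per simulation step (dummy values).
    memory.store_memory(state=[0.1, 0.2, 0.9, 0.2, 0.1], action=0.05,
                        probs=-0.9, vals=0.4, reward=0.8, done=False)

    # Once the buffer is full, draw shuffled mini-batches for training.
    if memory.is_memory_full():
        states, actions, probs, vals, rewards, dones, batches = memory.generate_batches()
        for batch in batches:
            pass  # e.g. train on states[batch], actions[batch], ...
        memory.clear_memory()
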
From 4976b064fe36a2c2c5cd05c4faa4b48ebf56e4f6 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 5 Aug 2024 22:46:17 +0200
Subject: [PATCH 06/37] Add new Command ID

---
 lib/APPReinforcementLearning/src/SerialMuxchannels.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/SerialMuxchannels.h b/lib/APPReinforcementLearning/src/SerialMuxchannels.h
index 75daf582..a3f8f052 100644
--- a/lib/APPReinforcementLearning/src/SerialMuxchannels.h
+++ b/lib/APPReinforcementLearning/src/SerialMuxchannels.h
@@ -89,8 +89,9 @@ namespace SMPChannelPayload
     /** Remote control commands. */
     typedef enum : uint8_t
     {
-        CMD_ID_IDLE = 0,       /**< Nothing to do. */
-        CMD_ID_SET_READY_STATE /**< Set Ready State. */
+        CMD_ID_IDLE = 0,          /**< Nothing to do. */
+        CMD_ID_SET_READY_STATE,   /**< Set Ready State. */
+        CMD_ID_SET_TRAINING_STATE /**< Set Training State. */
 
     } CmdId; /**< Command ID */
 

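For reference, the new command travels from the supervisor to the robot as a single byte on the CMD channel. A small sketch of the sending side, assuming the enum values keep the positions shown above and that smp_server exposes the send_data() call used throughout this series:

    import struct

    # Mirrors SMPChannelPayload::CmdId above (assumed values).
    CMD_ID_IDLE = 0
    CMD_ID_SET_READY_STATE = 1
    CMD_ID_SET_TRAINING_STATE = 2

    def send_training_state_cmd(smp_server) -> bool:
        """Pack the command ID as one byte and send it on the CMD channel."""
        cmd_payload = struct.pack("B", CMD_ID_SET_TRAINING_STATE)
        return smp_server.send_data("CMD", cmd_payload)
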
From 5c5b28d4f560bb469b4525465f4de48050856e77 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Tue, 6 Aug 2024 10:06:02 +0200
Subject: [PATCH 07/37] Add missing comment

---
 lib/APPReinforcementLearning/src/ReadyState.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/APPReinforcementLearning/src/ReadyState.h b/lib/APPReinforcementLearning/src/ReadyState.h
index 76c59ebf..6a778fc4 100644
--- a/lib/APPReinforcementLearning/src/ReadyState.h
+++ b/lib/APPReinforcementLearning/src/ReadyState.h
@@ -119,7 +119,9 @@ class ReadyState : public IState
     /** Duration of the selected mode in ms. This is the maximum time to select a mode. */
     static const uint32_t mode_selected_period = 100;
 
+    /** Duration to handle State changes in ms. */
     static const uint32_t m_state_Transition_period = 150;
+
     /**
      * The line sensor threshold (normalized) used to detect the track.
      * The track is detected in case the received value is greater or equal than

From e66ae2f47acc71d07958d2ee03c6efd7ccb8f167 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Tue, 6 Aug 2024 10:30:27 +0200
Subject: [PATCH 08/37] Resolve pylint import errors and correct comments

---
 webots/controllers/RL_Supervisor/agent.py     | 20 +++++++++----------
 webots/controllers/RL_Supervisor/networks.py  |  2 +-
 .../RL_Supervisor/rl_supervisor.py            |  7 ++++---
 .../RL_Supervisor/trajectory_buffer.py        |  4 ++--
 4 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py
index a6422069..98383bcd 100644
--- a/webots/controllers/RL_Supervisor/agent.py
+++ b/webots/controllers/RL_Supervisor/agent.py
@@ -1,4 +1,4 @@
-""" Implementation ofintelligent agent."""
+"""Implementation of intelligent agent."""
 
 # MIT License
 #
@@ -29,15 +29,16 @@
 ################################################################################
 
 import struct
-import numpy as np
-import tensorflow as tf
-import tensorflow_probability as tfp
+import numpy as np  # pylint: disable=import-error
+import tensorflow as tf  # pylint: disable=import-error
+import tensorflow_probability as tfp  # pylint: disable=import-error
 from trajectory_buffer import Memory
 from networks import Networks
 
 ################################################################################
 # Variables
 ################################################################################
+# pylint: disable=duplicate-code
 
 # Constants
 COMMAND_CHANNEL_NAME = "CMD"
@@ -63,8 +64,8 @@
     1.5880805820884731,
 ]
 MAX_SENSOR_VALUE = 1000
-MIN_STD_DEV = 0.1
-STD_DEV_FACTOR = 0.3995
+MIN_STD_DEV = 0.1  # Minimum standard deviation
+STD_DEV_FACTOR = 0.3995  # Standard deviation decay factor
 
 ################################################################################
 # Classes
@@ -370,10 +371,9 @@ def calculate_advantages(self, rewards, values, dones):
 
         return advantages
 
-    def learn(
-        self, states, actions, old_probs, values, rewards, dones
-    ):  # pylint: disable=too-many-arguments
-        # pylint: disable=too-many-locals
+    # pylint: disable=too-many-arguments
+    # pylint: disable=too-many-locals
+    def learn(self, states, actions, old_probs, values, rewards, dones):
         """Perform training to optimize model weights"""
 
         advantages = self.calculate_advantages(rewards, values, dones)
diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py
index 0328d60b..84f9e98f 100644
--- a/webots/controllers/RL_Supervisor/networks.py
+++ b/webots/controllers/RL_Supervisor/networks.py
@@ -28,7 +28,7 @@
 # Imports
 ################################################################################
 
-import tensorflow as tf
+import tensorflow as tf  # pylint: disable=import-error
 from tensorflow.keras import layers  # pylint: disable=import-error
 
 ################################################################################
diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py
index 90880fe9..fa7388a8 100644
--- a/webots/controllers/RL_Supervisor/rl_supervisor.py
+++ b/webots/controllers/RL_Supervisor/rl_supervisor.py
@@ -43,7 +43,7 @@
 ################################################################################
 # Variables
 ################################################################################
-
+# pylint: disable=duplicate-code
 
 # Constants
 ROBOT_NAME = "ROBOT"
@@ -63,7 +63,7 @@
 STATUS_CHANNEL_ERROR_VAL = 1
 
 MODE_CHANNEL_NAME = "MODE"
-CMD_ID_SET_READY_STATE = 1
+
 MIN_NUMBER_OF_STEPS = 200
 SENSOR_ID_MOST_LEFT = 0
 SENSOR_ID_MOST_RIGHT = 4
@@ -188,6 +188,7 @@ def process(self):
 
 # pylint: disable=duplicate-code
 # pylint: disable=too-many-branches
+# pylint: disable=too-many-statements
 def main_loop():
     """Main loop:
         - Perform simulation steps until Webots is stopping the controller.
@@ -277,7 +278,7 @@ def main_loop():
             # Resend any unsent data
             if agent.unsent_data:
 
-                # Stop The Simulation. Handel unsent Data
+                # Stop the simulation. Handle unsent data
                 supervisor.simulationSetMode(Supervisor.SIMULATION_MODE_PAUSE)
 
                 # Set simulation mode to real time when unsent data is resent
diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py
index 84de8a14..01767d06 100644
--- a/webots/controllers/RL_Supervisor/trajectory_buffer.py
+++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py
@@ -28,7 +28,7 @@
 # Imports
 ################################################################################
 
-import numpy as np
+import numpy as np  # pylint: disable=import-error
 
 ################################################################################
 # Variables
@@ -93,7 +93,7 @@ def generate_batches(self):
         )
 
     def get_sum_rewards(self) -> float:
-        """Calculate sum of reward"""
+        """Calculate total rewards"""
         sum_rewards = sum(self.__rewards)
 
         return sum_rewards

From 93d2b3975d0099345f0b9a4e79fd315f5994a616 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Wed, 7 Aug 2024 23:18:21 +0200
Subject: [PATCH 09/37] Add plotting script to analyze training logs

---
 webots/controllers/RL_Supervisor/agent.py    | 75 ++++++++++++++------
 webots/controllers/RL_Supervisor/plotting.py | 47 ++++++++++++
 2 files changed, 100 insertions(+), 22 deletions(-)
 create mode 100644 webots/controllers/RL_Supervisor/plotting.py

diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py
index 98383bcd..b3cdc924 100644
--- a/webots/controllers/RL_Supervisor/agent.py
+++ b/webots/controllers/RL_Supervisor/agent.py
@@ -27,7 +27,8 @@
 ################################################################################
 # Imports
 ################################################################################
-
+import csv
+import os
 import struct
 import numpy as np  # pylint: disable=import-error
 import tensorflow as tf  # pylint: disable=import-error
@@ -65,7 +66,7 @@
 ]
 MAX_SENSOR_VALUE = 1000
 MIN_STD_DEV = 0.1  # Minimum standard deviation
-STD_DEV_FACTOR = 0.3995  # Standard deviation decay factor
+STD_DEV_FACTOR = 0.99995  # Standard deviation decay factor
 
 ################################################################################
 # Classes
@@ -110,6 +111,9 @@ def __init__(
         self.state = "IDLE"
         self.data_sent = True
         self.unsent_data = []
+        self.critic_loss_history = []
+        self.actor_loss_history = []
+        self.reward_history = []
 
     def set_train_mode(self):
         """Set the Agent mode to train mode."""
@@ -155,30 +159,33 @@ def predict_action(self, state):
         # Calculation of probabilities by the Actor neural network
         probs = self.__neural_network.actor_network(state)
 
-        # Create a normal distribution with the calculated probabilities and the standard deviation
-        dist = tfp.distributions.Normal(probs, self.__std_dev)
+        if self.train_mode is True:
+            # Create a normal distribution with the calculated probabilities and the standard deviation
+            dist = tfp.distributions.Normal(probs, self.__std_dev)
 
-        # Sampling an action from the normal distribution
-        sampled_action = dist.sample()
+            # Sampling an action from the normal distribution
+            sampled_action = dist.sample()
 
-        # Apply the Tanh transformation to the sampled action
-        transformed_action = tf.tanh(sampled_action)
+            # Apply the Tanh transformation to the sampled action
+            transformed_action = tf.tanh(sampled_action)
 
-        # Calculation of the logarithm of the probability density of the sampled action
-        log_prob = dist.log_prob(sampled_action)
+            # Calculation of the logarithm of the probability density of the sampled action
+            log_prob = dist.log_prob(sampled_action)
 
-        # Calculation of the Jacobian determinant for the Tanh transformation
-        jacobian_log_det = tf.math.log(1 - tf.square(transformed_action) + 1e-6)
+            # Calculation of the Jacobian determinant for the Tanh transformation
+            jacobian_log_det = tf.math.log(1 - tf.square(transformed_action) + 1e-6)
 
-        # Calculation of Adjusted probabilities by the neural network
-        adjusted_log_prob = log_prob - jacobian_log_det
+            # Calculation of Adjusted probabilities by the neural network
+            adjusted_log_prob = log_prob - jacobian_log_det
 
-        # calculate the estimated value of a state, which is determined by the Critic network
-        value = self.__neural_network.critic_network(state)
+            # calculate the estimated value of a state, which is determined by the Critic network
+            value = self.__neural_network.critic_network(state)
 
-        self.action = transformed_action.numpy()[0]
-        self.value = value.numpy()[0]
-        self.adjusted_log_prob = adjusted_log_prob.numpy()[0]
+            self.action = transformed_action.numpy()[0]
+            self.value = value.numpy()[0]
+            self.adjusted_log_prob = adjusted_log_prob.numpy()[0]
+        else:
+            self.action = probs.numpy()[0]
 
         return self.action
 
@@ -419,11 +426,9 @@ def learn(self, states, actions, old_probs, values, rewards, dones):
         with tf.GradientTape() as tape:
 
             critic_value = self.__neural_network.critic_network(states)
-            critic_value = tf.squeeze(critic_value, 1)
             returns = advantages + values
-
             # Generate loss
-            critic_loss = tf.keras.losses.MSE(critic_value, returns)
+            critic_loss = tf.math.reduce_mean(tf.math.pow(returns - critic_value, 2))
 
         # calculate gradient
         critic_params = self.__neural_network.critic_network.trainable_variables
@@ -431,6 +436,32 @@ def learn(self, states, actions, old_probs, values, rewards, dones):
         self.__neural_network.critic_optimizer.apply_gradients(
             zip(critic_grads, critic_params)
         )
+        self.actor_loss_history.append(actor_loss.numpy())
+        self.critic_loss_history.append(critic_loss.numpy())
+        self.reward_history.append(sum(rewards))
+
+        # saving logs in a CSV file
+        self.save_logs_to_csv()
+
+    def save_logs_to_csv(self):
+        """Function for saving logs in a CSV file"""
+
+        # Ensure the directory exists
+        log_dir = "logs"
+        os.makedirs(log_dir, exist_ok=True)
+        log_file = os.path.join(log_dir, "training_logs.csv")
+
+        with open(log_file, mode="w", encoding="utf-8", newline="") as file:
+            writer = csv.writer(file)
+            writer.writerow(["Actor Loss", "Critic Loss", "Reward"])
+            for indx, reward in enumerate(self.reward_history):
+                writer.writerow(
+                    [
+                        self.actor_loss_history[indx],
+                        self.critic_loss_history[indx],
+                        reward,
+                    ]
+                )
 
     def perform_training(self):
         """Runs the training process."""
diff --git a/webots/controllers/RL_Supervisor/plotting.py b/webots/controllers/RL_Supervisor/plotting.py
new file mode 100644
index 00000000..a6afd7fc
--- /dev/null
+++ b/webots/controllers/RL_Supervisor/plotting.py
@@ -0,0 +1,47 @@
+""" Plotting script  with Matplotlib """
+
+# Imports
+import matplotlib.pyplot as plt
+import pandas as pd
+
+# Define the path to the CSV file
+LOG_FILE = "logs/training_logs.csv"
+
+# Read the CSV file
+data = pd.read_csv(LOG_FILE)
+
+# Define length
+data["Mini Batch"] = range(1, len(data) + 1)
+
+# Plotting Actor Loss
+plt.figure(figsize=(10, 5))
+plt.plot(data["Mini Batch"], data["Actor Loss"], label="Actor Loss")
+plt.xlabel("Mini Batch")
+plt.ylabel("Loss")
+plt.title("Actor Loss Over Mini Batches")
+plt.legend()
+plt.grid(True)
+plt.savefig("logs/actor_loss_plot.png")
+plt.show()
+
+# Plotting Critic Loss
+plt.figure(figsize=(10, 5))
+plt.plot(data["Mini Batch"], data["Critic Loss"], label="Critic Loss")
+plt.xlabel("Mini Batch")
+plt.ylabel("Loss")
+plt.title("Critic Loss Over Mini Batches")
+plt.legend()
+plt.grid(True)
+plt.savefig("logs/critic_loss_plot.png")
+plt.show()
+
+# Plotting Total Rewards
+plt.figure(figsize=(10, 5))
+plt.plot(data["Mini Batch"], data["Reward"], label="Reward")
+plt.xlabel("Mini Batch")
+plt.ylabel("Total Reward")
+plt.title("Total Rewards Over Mini Batches")
+plt.legend()
+plt.grid(True)
+plt.savefig("logs/total_rewards_plot.png")
+plt.show()

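A note on the sampling path added in agent.py above: the action is drawn from a Normal distribution and squashed with tanh, so its log-probability has to be corrected by the log-determinant of the tanh Jacobian, log(1 - tanh(u)^2). A standalone NumPy sketch of that correction, independent of the robot code (mean, standard deviation and sample are illustrative):

    import numpy as np

    def tanh_squashed_log_prob(mean, std_dev, sampled_action):
        """Log-probability of a tanh-squashed Gaussian action (illustrative)."""
        # Gaussian log-density of the pre-squash sample.
        log_prob = (
            -0.5 * ((sampled_action - mean) / std_dev) ** 2
            - np.log(std_dev)
            - 0.5 * np.log(2.0 * np.pi)
        )
        transformed_action = np.tanh(sampled_action)
        # Change-of-variables correction for the tanh transformation.
        jacobian_log_det = np.log(1.0 - transformed_action**2 + 1e-6)
        return log_prob - jacobian_log_det

    print(tanh_squashed_log_prob(mean=0.0, std_dev=0.9, sampled_action=0.3))
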
From 849771dbc18996f038cb4841cac7a02acfe392e3 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Thu, 8 Aug 2024 11:54:22 +0200
Subject: [PATCH 10/37] Add comments and fix import errors

---
 webots/controllers/RL_Supervisor/agent.py    | 14 +++++++++++---
 webots/controllers/RL_Supervisor/plotting.py |  4 ++--
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py
index b3cdc924..2d071d3b 100644
--- a/webots/controllers/RL_Supervisor/agent.py
+++ b/webots/controllers/RL_Supervisor/agent.py
@@ -160,7 +160,8 @@ def predict_action(self, state):
         probs = self.__neural_network.actor_network(state)
 
         if self.train_mode is True:
-            # Create a normal distribution with the calculated probabilities and the standard deviation
+            # Create a normal distribution with the calculated probabilities
+            # and the standard deviation
             dist = tfp.distributions.Normal(probs, self.__std_dev)
 
             # Sampling an action from the normal distribution
@@ -425,10 +426,17 @@ def learn(self, states, actions, old_probs, values, rewards, dones):
         # optimize Critic Network weights
         with tf.GradientTape() as tape:
 
+            #  The critic value represents the expected return from state 𝑠𝑡.
+            # It provides an estimate of how good it is to be in a given state.
             critic_value = self.__neural_network.critic_network(states)
-            returns = advantages + values
+
+            # the total discounted reward accumulated from time step 𝑡
+            estimate_returns = advantages + values
+
             # Generate loss
-            critic_loss = tf.math.reduce_mean(tf.math.pow(returns - critic_value, 2))
+            critic_loss = tf.math.reduce_mean(
+                tf.math.pow(estimate_returns - critic_value, 2)
+            )
 
         # calculate gradient
         critic_params = self.__neural_network.critic_network.trainable_variables
diff --git a/webots/controllers/RL_Supervisor/plotting.py b/webots/controllers/RL_Supervisor/plotting.py
index a6afd7fc..4b20bf63 100644
--- a/webots/controllers/RL_Supervisor/plotting.py
+++ b/webots/controllers/RL_Supervisor/plotting.py
@@ -1,8 +1,8 @@
 """ Plotting script  with Matplotlib """
 
 # Imports
-import matplotlib.pyplot as plt
-import pandas as pd
+import matplotlib.pyplot as plt  # pylint: disable=import-error
+import pandas as pd  # pylint: disable=import-error
 
 # Define the path to the CSV file
 LOG_FILE = "logs/training_logs.csv"

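The critic update in the patch above reduces to a mean-squared error between the estimated returns (advantages + values) and the critic's current value predictions. A minimal NumPy sketch of that loss with made-up numbers:

    import numpy as np

    values = np.array([0.40, 0.55, 0.60])        # critic estimates stored during the rollout
    advantages = np.array([0.10, -0.05, 0.20])   # e.g. from the advantage calculation
    critic_value = np.array([0.45, 0.50, 0.65])  # current critic forward pass

    estimate_returns = advantages + values
    critic_loss = np.mean((estimate_returns - critic_value) ** 2)
    print(critic_loss)
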
From 0a5001cc8286c0654e84ca5bdf21ff1e5f2f9261 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 19 Aug 2024 08:17:02 +0200
Subject: [PATCH 11/37] Add comments, rename variables, and update values

---
 webots/controllers/RL_Supervisor/agent.py     | 64 ++++++++++---------
 webots/controllers/RL_Supervisor/networks.py  | 27 ++++++--
 .../RL_Supervisor/rl_supervisor.py            | 17 +++--
 3 files changed, 70 insertions(+), 38 deletions(-)

diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py
index 2d071d3b..19fa550b 100644
--- a/webots/controllers/RL_Supervisor/agent.py
+++ b/webots/controllers/RL_Supervisor/agent.py
@@ -102,7 +102,7 @@ def __init__(
         self.__neural_network = Networks(self.__alpha)
         self.__training_index = 0  # Track batch index during training
         self.__current_batch = None  # Saving of the current batch which is in process
-        self.__std_dev = 1
+        self.__std_dev = 0.9
         self.done = False
         self.action = None
         self.value = None
@@ -156,13 +156,14 @@ def predict_action(self, state):
         m_state = self.normalize_sensor_data(state)
         state = tf.convert_to_tensor([m_state], dtype=tf.float32)
 
-        # Calculation of probabilities by the Actor neural network
-        probs = self.__neural_network.actor_network(state)
+        # output from the Actor Network.
+        action_mean = self.__neural_network.actor_network(state)
 
+        # Training mode is set.
         if self.train_mode is True:
-            # Create a normal distribution with the calculated probabilities
-            # and the standard deviation
-            dist = tfp.distributions.Normal(probs, self.__std_dev)
+
+            # Create a normal distribution
+            dist = tfp.distributions.Normal(action_mean, self.__std_dev)
 
             # Sampling an action from the normal distribution
             sampled_action = dist.sample()
@@ -185,8 +186,10 @@ def predict_action(self, state):
             self.action = transformed_action.numpy()[0]
             self.value = value.numpy()[0]
             self.adjusted_log_prob = adjusted_log_prob.numpy()[0]
+
+        # Driving mode is set
         else:
-            self.action = probs.numpy()[0]
+            self.action = action_mean.numpy()[0]
 
         return self.action
 
@@ -240,6 +243,7 @@ def update(self, robot_node):
             self.data_sent = self.__serialmux.send_data(
                 "SPEED_SET", motorcontrol
             )  # stop the motors immediately
+
             # Failed to send data. Appends the data to unsent_data List
             if self.data_sent is False:
                 self.unsent_data.append(("SPEED_SET", motorcontrol))
@@ -253,29 +257,9 @@ def update(self, robot_node):
     def normalize_sensor_data(self, sensor_data):
         """The normalize_sensor_data function scales the sensor data to a range between 0 and 1."""
 
-        # Normalized sensor data
         sensor_data = np.array(sensor_data) / MAX_SENSOR_VALUE
-
         return sensor_data
 
-    def calculate_reward(self, sensor_data):
-        """
-        The calculate_reward function evaluates the consequences of a certain
-        action performed in a certain state by calculating the resulting reward
-        """
-        estimated_pos = self.calculate_position(sensor_data)
-
-        # Return reward between 0 and 10
-        if 500 <= estimated_pos <= 2000:
-            reward = ((1 / 150) * estimated_pos) - (10 / 3)
-            return reward
-
-        if 2000 < estimated_pos <= 3500:
-            reward = ((-1 / 150) * estimated_pos) + (70 / 3)
-            return reward
-
-        return 0
-
     def calculate_position(self, sensor_data):
         """
         Determines the deviation and returns an estimated position of the robot
@@ -339,6 +323,26 @@ def calculate_position(self, sensor_data):
 
         return estimated_pos
 
+    def calculate_reward(self, sensor_data):
+        """
+        The calculate_reward function evaluates the consequences of a certain
+        action performed in a certain state by calculating the resulting reward.
+        A reward of 1 means that the robot is in the center of the Line.
+        """
+        estimated_pos = self.calculate_position(sensor_data)
+
+        # Reward scaled between 0 and 1 if the robot is on the line.
+        if 500 <= estimated_pos <= 2000:
+            reward = (((1 / 150) * estimated_pos) - (10 / 3)) / 10
+            return reward
+
+        # Reward scaled between 1 and 0 if the robot is on the line.
+        if 2000 < estimated_pos <= 3500:
+            reward = (((-1 / 150) * estimated_pos) + (70 / 3))/10
+            return reward
+
+        return 0
+
     def calculate_advantages(self, rewards, values, dones):
         """Calculate advantages for each state in a mini-batch."""
 
@@ -386,7 +390,7 @@ def learn(self, states, actions, old_probs, values, rewards, dones):
 
         advantages = self.calculate_advantages(rewards, values, dones)
 
-        # optimize Actor Network weights
+        # optimize Actor Network weights 
         with tf.GradientTape() as tape:
             states = tf.convert_to_tensor(states)
             actions = tf.convert_to_tensor(actions)
@@ -426,7 +430,7 @@ def learn(self, states, actions, old_probs, values, rewards, dones):
         # optimize Critic Network weights
         with tf.GradientTape() as tape:
 
-            #  The critic value represents the expected return from state 𝑠𝑡.
+            # The critic value represents the expected return from state 𝑠𝑡.
             # It provides an estimate of how good it is to be in a given state.
             critic_value = self.__neural_network.critic_network(states)
 
@@ -479,7 +483,7 @@ def perform_training(self):
             # Grab sample from memory
             self.__current_batch = self.__memory.generate_batches()
 
-        # Perform training with mini batchtes.
+        # Perform training with mini batches.
         if self.__training_index < len(self.__current_batch[-1]):
             (
                 state_arr,
diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py
index 84f9e98f..f3624593 100644
--- a/webots/controllers/RL_Supervisor/networks.py
+++ b/webots/controllers/RL_Supervisor/networks.py
@@ -30,6 +30,7 @@
 
 import tensorflow as tf  # pylint: disable=import-error
 from tensorflow.keras import layers  # pylint: disable=import-error
+from tensorflow.keras.regularizers import l2  # pylint: disable=import-error
 
 ################################################################################
 # Variables
@@ -59,22 +60,31 @@ def build_actor_network(self):
             64,
             activation="relu",
             kernel_initializer="he_normal",
+            kernel_regularizer=l2(0.01),
             bias_initializer="zeros",
         )(state_input)
         fc2 = layers.Dense(
             64,
             activation="relu",
             kernel_initializer="he_normal",
+            kernel_regularizer=l2(0.01),
             bias_initializer="zeros",
         )(fc1)
-        mu = layers.Dense(
+        fc3 = layers.Dense(
+            32,
+            activation="relu",
+            kernel_initializer="he_normal",
+            kernel_regularizer=l2(0.01),
+            bias_initializer="zeros",
+        )(fc2)
+        mean = layers.Dense(
             1,
             activation="tanh",
             kernel_initializer="glorot_uniform",
             bias_initializer="zeros",
-        )(fc2)
+        )(fc3)
 
-        return tf.keras.models.Model(inputs=state_input, outputs=mu)
+        return tf.keras.models.Model(inputs=state_input, outputs=mean)
 
     def build_critic_network(self):
         """Build Critic Network"""
@@ -84,15 +94,24 @@ def build_critic_network(self):
             64,
             activation="relu",
             kernel_initializer="he_normal",
+            kernel_regularizer=l2(0.01),
             bias_initializer="zeros",
         )(state_input)
         fc2 = layers.Dense(
             64,
             activation="relu",
             kernel_initializer="he_normal",
+            kernel_regularizer=l2(0.01),
             bias_initializer="zeros",
         )(fc1)
-        value = layers.Dense(1)(fc2)  # Value output
+        fc3 = layers.Dense(
+            32,
+            activation="relu",
+            kernel_initializer="he_normal",
+            kernel_regularizer=l2(0.01),
+            bias_initializer="zeros",
+        )(fc2)
+        value = layers.Dense(1)(fc3)  # Value output
 
         return tf.keras.models.Model(inputs=state_input, outputs=value)
 
diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py
index fa7388a8..1e13460d 100644
--- a/webots/controllers/RL_Supervisor/rl_supervisor.py
+++ b/webots/controllers/RL_Supervisor/rl_supervisor.py
@@ -101,13 +101,14 @@ def callback_line_sensors(self, payload: bytearray) -> None:
         """Callback LINE_SENS Channel."""
         sensor_data = struct.unpack("5H", payload)
         self.steps += 1
-        # determine lost line condition
+
+        # Determine lost line condition
         if all(value == 0 for value in sensor_data):
             self.__no_line_detection_count += 1
         else:
             self.__no_line_detection_count = 0
 
-        # detect start/stop line
+        # Detect start/stop line
         is_start_stop = all(
             value >= LINE_SENSOR_ON_TRACK_MIN_VALUE for value in sensor_data
         )
@@ -126,7 +127,12 @@ def callback_line_sensors(self, payload: bytearray) -> None:
 
         # The sequence of states and actions is stored in memory for the training phase.
         if self.__agent.train_mode:
-            reward = self.__agent.calculate_reward(sensor_data)
+
+            # receive a -1 punishment if the robot leaves the line
+            if self.__no_line_detection_count > 0:
+                reward = -1
+            else:
+                reward = self.__agent.calculate_reward(sensor_data)
 
             # Start storage The data after the second received sensor data
             if self.last_sensor_data is not None:
@@ -159,6 +165,8 @@ def load_models(self, path) -> None:
         """Load Model if exist"""
         if os.path.exists(path):
             self.__agent.load_models()
+        else:
+            print("No model available")
 
     def retry_unsent_data(self, unsent_data: list) -> bool:
         """Resent any unsent Data"""
@@ -259,6 +267,7 @@ def main_loop():
     if status != -1:
 
         controller.load_models(PATH)
+
         # simulation loop
         while supervisor.step(timestep) != -1:
             controller.process()
@@ -273,7 +282,7 @@ def main_loop():
                 agent.perform_training()
 
             if 1000 <= agent.num_episodes:
-                print(f"Episodes: {agent.num_episodes}")
+                print(f"The number of episodes:{agent.num_episodes}")
 
             # Resend any unsent data
             if agent.unsent_data:

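Patch 11 rescales the position-based reward to the range [0, 1] (1.0 when the robot is centered on the line, position 2000) and lets the supervisor hand out a flat -1 punishment when no line is detected. A standalone sketch of that piecewise reward, reusing the formulas from the patch (the example positions are illustrative):

    def piecewise_reward(estimated_pos, line_lost=False):
        """Reward in [0, 1]; 1.0 at the line center (position 2000), -1 if the line is lost."""
        if line_lost:
            return -1
        if 500 <= estimated_pos <= 2000:
            return (((1 / 150) * estimated_pos) - (10 / 3)) / 10
        if 2000 < estimated_pos <= 3500:
            return (((-1 / 150) * estimated_pos) + (70 / 3)) / 10
        return 0

    print(piecewise_reward(2000))                  # ~1.0, robot centered on the line
    print(piecewise_reward(500))                   # ~0.0, far off to one side
    print(piecewise_reward(1000, line_lost=True))  # -1, line lost
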
From 0de39f70800d515771c1aa43f3ffcc82a912f8aa Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 19 Aug 2024 08:29:15 +0200
Subject: [PATCH 12/37] Remove trailing whitespace from code file

---
 webots/controllers/RL_Supervisor/agent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py
index 19fa550b..eafb037e 100644
--- a/webots/controllers/RL_Supervisor/agent.py
+++ b/webots/controllers/RL_Supervisor/agent.py
@@ -390,7 +390,7 @@ def learn(self, states, actions, old_probs, values, rewards, dones):
 
         advantages = self.calculate_advantages(rewards, values, dones)
 
-        # optimize Actor Network weights 
+        # optimize Actor Network weights
         with tf.GradientTape() as tape:
             states = tf.convert_to_tensor(states)
             actions = tf.convert_to_tensor(actions)

From cecb42785792d8087cf4acb0841b0d71aa7e15f3 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Thu, 22 Aug 2024 11:44:16 +0200
Subject: [PATCH 13/37] Reset sendLineSensorsData function to its original
 state to fix issues with sensor data transmission.

---
 lib/APPReinforcementLearning/src/App.cpp | 40 ++++++++++++++----------
 lib/APPReinforcementLearning/src/App.h   |  7 ++++-
 2 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp
index 623ff636..d3e212a6 100644
--- a/lib/APPReinforcementLearning/src/App.cpp
+++ b/lib/APPReinforcementLearning/src/App.cpp
@@ -133,23 +133,7 @@ void App::loop()
     if (true == m_sendLineSensorsDataInterval.isTimeout() &&
         (&DrivingState::getInstance() == m_systemStateMachine.getState()))
     {
-        ILineSensors&   lineSensors      = Board::getInstance().getLineSensors();
-        uint8_t         maxLineSensors   = lineSensors.getNumLineSensors();
-        const uint16_t* lineSensorValues = lineSensors.getSensorValues();
-        uint8_t         lineSensorIdx    = 0U;
-        LineSensorData  payload;
-
-        if (LINE_SENSOR_CHANNEL_DLC == (maxLineSensors * sizeof(uint16_t)))
-        {
-            while (maxLineSensors > lineSensorIdx)
-            {
-                payload.lineSensorData[lineSensorIdx] = lineSensorValues[lineSensorIdx];
-
-                ++lineSensorIdx;
-            }
-        }
-
-        m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload));
+        sendLineSensorsData();
 
         m_sendLineSensorsDataInterval.restart();
     }
@@ -217,6 +201,28 @@ void App::handleRemoteCommand(const Command& cmd)
  * Private Methods
  *****************************************************************************/
 
+void App::sendLineSensorsData() 
+{
+    ILineSensors&   lineSensors      = Board::getInstance().getLineSensors();
+    uint8_t         maxLineSensors   = lineSensors.getNumLineSensors();
+    const uint16_t* lineSensorValues = lineSensors.getSensorValues();
+    uint8_t         lineSensorIdx    = 0U;
+    LineSensorData  payload;
+
+    if (LINE_SENSOR_CHANNEL_DLC == (maxLineSensors * sizeof(uint16_t)))
+    {
+        while (maxLineSensors > lineSensorIdx)
+        {
+            payload.lineSensorData[lineSensorIdx] = lineSensorValues[lineSensorIdx];
+
+            ++lineSensorIdx;
+        }
+    }
+
+    /* Ignoring return value, as error handling is not available. */
+    m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload));
+}
+
 bool App::setupSerialMuxProt()
 {
     bool isSuccessful = false;
diff --git a/lib/APPReinforcementLearning/src/App.h b/lib/APPReinforcementLearning/src/App.h
index 4d73fc4a..2683c301 100644
--- a/lib/APPReinforcementLearning/src/App.h
+++ b/lib/APPReinforcementLearning/src/App.h
@@ -150,7 +150,12 @@ class App
      *
      * @return If successful returns true, otherwise false.
      */
-    bool setupSerialMuxProt();
+    bool setupSerialMuxProt()
+
+    /**
+     * Send line sensors data via SerialMuxProt.
+     */
+    void sendLineSensorsData() ;
 
     /**
      * Copy construction of an instance.

From 4e2aec2d8165a64a6e3674a9e501c4b6eddbd624 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Thu, 22 Aug 2024 11:54:37 +0200
Subject: [PATCH 14/37] Fix missing semicolon

---
 lib/APPReinforcementLearning/src/App.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/App.h b/lib/APPReinforcementLearning/src/App.h
index 2683c301..846292dd 100644
--- a/lib/APPReinforcementLearning/src/App.h
+++ b/lib/APPReinforcementLearning/src/App.h
@@ -150,12 +150,12 @@ class App
      *
      * @return If successful returns true, otherwise false.
      */
-    bool setupSerialMuxProt()
+    bool setupSerialMuxProt();
 
     /**
      * Send line sensors data via SerialMuxProt.
      */
-    void sendLineSensorsData() ;
+    void sendLineSensorsData();
 
     /**
      * Copy construction of an instance.

From 2935cb34497b403df462924ad882e98e5bc62821 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 26 Aug 2024 10:29:24 +0200
Subject: [PATCH 15/37] Apply coding style corrections

---
 lib/APPReinforcementLearning/src/App.cpp        | 16 +++++++++-------
 lib/APPReinforcementLearning/src/App.h          |  6 +++---
 lib/APPReinforcementLearning/src/ReadyState.cpp |  4 ++--
 lib/APPReinforcementLearning/src/ReadyState.h   |  4 ++--
 4 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp
index d3e212a6..9780d434 100644
--- a/lib/APPReinforcementLearning/src/App.cpp
+++ b/lib/APPReinforcementLearning/src/App.cpp
@@ -124,7 +124,7 @@ void App::loop()
             payload = {SMPChannelPayload::Status::DONE};
         }
 
-        m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload));
+        m_dataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload));
 
         m_statusTimer.restart();
     }
@@ -133,7 +133,7 @@ void App::loop()
     if (true == m_sendLineSensorsDataInterval.isTimeout() &&
         (&DrivingState::getInstance() == m_systemStateMachine.getState()))
     {
-        sendLineSensorsData();
+        m_dataSent = sendLineSensorsData();
 
         m_sendLineSensorsDataInterval.restart();
     }
@@ -148,13 +148,13 @@ void App::loop()
             SMPChannelPayload::Mode payload =
                 (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE;
 
-            m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload));
+            m_dataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload));
 
             m_modeSelectionSent = true;
         }
     }
 
-    if (false == m_data_sent)
+    if (false == m_dataSent)
     {
         /* Failed to send data to the supervisor. Go to error state. */
         ErrorState::getInstance().setErrorMsg("DSF");
@@ -201,12 +201,13 @@ void App::handleRemoteCommand(const Command& cmd)
  * Private Methods
  *****************************************************************************/
 
-void App::sendLineSensorsData() 
+bool App::sendLineSensorsData() const 
 {
     ILineSensors&   lineSensors      = Board::getInstance().getLineSensors();
     uint8_t         maxLineSensors   = lineSensors.getNumLineSensors();
     const uint16_t* lineSensorValues = lineSensors.getSensorValues();
     uint8_t         lineSensorIdx    = 0U;
+    bool            isDataSent       = true;
     LineSensorData  payload;
 
     if (LINE_SENSOR_CHANNEL_DLC == (maxLineSensors * sizeof(uint16_t)))
@@ -219,8 +220,9 @@ void App::sendLineSensorsData()
         }
     }
 
-    /* Ignoring return value, as error handling is not available. */
-    m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload));
+    isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload));
+
+    return isDataSent;
 }
 
 bool App::setupSerialMuxProt()
diff --git a/lib/APPReinforcementLearning/src/App.h b/lib/APPReinforcementLearning/src/App.h
index 846292dd..c8dca206 100644
--- a/lib/APPReinforcementLearning/src/App.h
+++ b/lib/APPReinforcementLearning/src/App.h
@@ -73,7 +73,7 @@ class App
         m_statusTimer(),
         m_sendLineSensorsDataInterval(),
         m_modeSelectionSent(false),
-        m_data_sent(true),
+        m_dataSent(true),
         m_smpServer(Serial, this)
     {
     }
@@ -143,7 +143,7 @@ class App
     bool m_modeSelectionSent;
 
     /** check if the data has been sent */
-    bool m_data_sent;
+    bool m_dataSent;
 
     /**
      * Setup the SerialMuxProt channels.
@@ -155,7 +155,7 @@ class App
     /**
      * Send line sensors data via SerialMuxProt.
      */
-    void sendLineSensorsData();
+    bool sendLineSensorsData() const;
 
     /**
      * Copy construction of an instance.
diff --git a/lib/APPReinforcementLearning/src/ReadyState.cpp b/lib/APPReinforcementLearning/src/ReadyState.cpp
index ec720077..1af24879 100644
--- a/lib/APPReinforcementLearning/src/ReadyState.cpp
+++ b/lib/APPReinforcementLearning/src/ReadyState.cpp
@@ -80,8 +80,8 @@ void ReadyState::entry()
         display.print(m_lapTime);
         display.print("ms");
     }
-    m_stateTransitionTimer.start(m_state_Transition_period);
-    m_modeTimeoutTimer.start(mode_selected_period);
+    m_stateTransitionTimer.start(STATE_TRANSITION_PERIOD);
+    m_modeTimeoutTimer.start(MODE_SELECTED_PERIOD);
     m_mode                        = IDLE;
     m_isLastStartStopLineDetected = false;
     m_isButtonAPressed            = false;
diff --git a/lib/APPReinforcementLearning/src/ReadyState.h b/lib/APPReinforcementLearning/src/ReadyState.h
index 6a778fc4..5cef49ec 100644
--- a/lib/APPReinforcementLearning/src/ReadyState.h
+++ b/lib/APPReinforcementLearning/src/ReadyState.h
@@ -117,10 +117,10 @@ class ReadyState : public IState
     };
 
     /** Duration of the selected mode in ms. This is the maximum time to select a mode. */
-    static const uint32_t mode_selected_period = 100;
+    static const uint32_t MODE_SELECTED_PERIOD = 100U;
 
     /** Duration to handle State changes in ms. */
-    static const uint32_t m_state_Transition_period = 150;
+    static const uint32_t STATE_TRANSITION_PERIOD = 150U;
 
     /**
      * The line sensor threshold (normalized) used to detect the track.

From 5002ecd1cd7fee4a9c23105a320b0dca9979910a Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 26 Aug 2024 10:52:19 +0200
Subject: [PATCH 16/37] Add comments and refactor functions from agent to
 trajectory_buffer class

---
 lib/APPReinforcementLearning/src/App.h        |   2 +
 webots/controllers/RL_Supervisor/agent.py     | 341 ++++++++----------
 webots/controllers/RL_Supervisor/plotting.py  |   5 +
 .../RL_Supervisor/rl_supervisor.py            |  26 +-
 .../RL_Supervisor/trajectory_buffer.py        | 147 +++++++-
 5 files changed, 311 insertions(+), 210 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/App.h b/lib/APPReinforcementLearning/src/App.h
index c8dca206..bd3c6a1e 100644
--- a/lib/APPReinforcementLearning/src/App.h
+++ b/lib/APPReinforcementLearning/src/App.h
@@ -154,6 +154,8 @@ class App
 
     /**
      * Send line sensors data via SerialMuxProt.
+     *
+     * @return true if data has been sent, false otherwise.
      */
     bool sendLineSensorsData() const;
 
diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py
index eafb037e..69494144 100644
--- a/webots/controllers/RL_Supervisor/agent.py
+++ b/webots/controllers/RL_Supervisor/agent.py
@@ -33,7 +33,7 @@
 import numpy as np  # pylint: disable=import-error
 import tensorflow as tf  # pylint: disable=import-error
 import tensorflow_probability as tfp  # pylint: disable=import-error
-from trajectory_buffer import Memory
+from trajectory_buffer import Buffer
 from networks import Networks
 
 ################################################################################
@@ -66,7 +66,7 @@
 ]
 MAX_SENSOR_VALUE = 1000
 MIN_STD_DEV = 0.1  # Minimum standard deviation
-STD_DEV_FACTOR = 0.99995  # Standard deviation decay factor
+STD_DEV_FACTOR = 0.9995  # Standard deviation decay factor
 
 ################################################################################
 # Classes
@@ -88,21 +88,20 @@ def __init__(
         batch_size=64,
         chkpt_dir="models/",
         top_speed=2000,
-        max_buffer_length=600,
+        max_buffer_length=65536,
     ):
         self.__serialmux = smp_server
-        self.__gamma = gamma
         self.__alpha = alpha
-        self.__gae_lambda = gae_lambda
         self.__policy_clip = policy_clip
         self.__chkpt_dir = chkpt_dir
         self.train_mode = False
         self.__top_speed = top_speed
-        self.__memory = Memory(batch_size, max_buffer_length)
+        self.__buffer = Buffer(batch_size, max_buffer_length, gamma, gae_lambda)
         self.__neural_network = Networks(self.__alpha)
         self.__training_index = 0  # Track batch index during training
         self.__current_batch = None  # Saving of the current batch which is in process
         self.__std_dev = 0.9
+        self.n_epochs = 10
         self.done = False
         self.action = None
         self.value = None
@@ -127,11 +126,20 @@ def set_drive_mode(self):
         self.num_episodes = 0
 
     def store_transition(
-        self, state, action, probs, vals, reward, done
+        self, state, action, probs, value, reward, done
     ):  # pylint: disable=too-many-arguments
-        """Store transitions in the replay buffer."""
+        """Store transitions in the replay buffer.
 
-        self.__memory.store_memory(state, action, probs, vals, reward, done)
+        Parameters
+        ----------
+            state: The state observed.
+            action: The action taken.
+            prob: The probability of taking the action.
+            value: The estimated value of the state.
+            reward: The reward received.
+            done: Indicating whether the target sequence has been reached.
+        """
+        self.__buffer.store_memory(state, action, probs, value, reward, done)
 
     def save_models(self):
         """Saves the models in the specified file."""
@@ -150,13 +158,24 @@ def load_models(self):
         )
 
     def predict_action(self, state):
-        """Predicts an action based on the current state."""
+        """
+        Predicts an action based on the current state.
 
-        # Conversion of the state into a tensor
+        Parameters
+        ----------
+            state: The state observed.
+        
+        Returns
+        ----------
+            float32: The action taken.
+        """
+        # scales the sensor data to a range between 0 and 1
         m_state = self.normalize_sensor_data(state)
+
+        # Conversion of the state into a tensor
         state = tf.convert_to_tensor([m_state], dtype=tf.float32)
 
-        # output from the Actor Network.
+        # Forward pass through the actor network to get the action mean
         action_mean = self.__neural_network.actor_network(state)
 
         # Training mode is set.
@@ -194,17 +213,24 @@ def predict_action(self, state):
         return self.action
 
     def send_motor_speeds(self, state):
-        """Sends the motor speeds to the robot."""
+        """
+        Sends the motor speeds to the robot.
 
+        Parameters
+        ----------
+            state: The state observed.
+        """
+        # pre_action contains the predicted action for the given state, calculated based
+        # on the Actor model output.
         pre_action = self.predict_action(state)
 
         # Get motor speed difference
-        speeddifference = self.__top_speed * pre_action
+        speed_difference = self.__top_speed * pre_action
 
         # Get individual motor speeds. The sign of speedDifference
         # determines if the robot turns left or right.
-        left_motor_speed = int(self.__top_speed - speeddifference)
-        right_motor_speed = int(self.__top_speed + speeddifference)
+        left_motor_speed = int(self.__top_speed - speed_difference)
+        right_motor_speed = int(self.__top_speed + speed_difference)
 
         control_data = struct.pack("2H", left_motor_speed, right_motor_speed)
         self.data_sent = self.__serialmux.send_data("SPEED_SET", control_data)
@@ -214,26 +240,29 @@ def send_motor_speeds(self, state):
             self.unsent_data.append(("SPEED_SET", control_data))
 
     def update(self, robot_node):
-        """Checks if the sequence has ended, performs updates, and saves the models."""
+        """
+        Checks if the sequence has ended and performs updates.
+
+        Parameters
+        ----------
+            robot_node: The Robot interface
+        """
 
         # Checks whether the sequence has ended if it is set to Training mode.
         if (self.train_mode is True) and (
-            (self.done is True) or (self.__memory.is_memory_full() is True)
+            (self.done is True) or (self.__buffer.is_memory_full() is True)
         ):
             cmd_payload = struct.pack("B", CMD_ID_SET_TRAINING_STATE)
             self.data_sent = self.__serialmux.send_data("CMD", cmd_payload)
 
-            # Failed to send data. Appends the data to unsent_data List
+            # Failed to send data. Appends the data to unsent_data List.
             if self.data_sent is False:
                 self.unsent_data.append(("CMD", cmd_payload))
 
+            # Stopping condition for sequence was reached.
             self.reinitialize(robot_node)
             self.state = "TRAINING"
 
-            # save models
-            if self.__memory.get_sum_rewards() >= 1000.0:
-                self.save_models()
-
         # Checks whether the sequence has ended if it is set to driving mode.
         if (self.train_mode is False) and (self.done is True):
             self.done = False
@@ -255,205 +284,131 @@ def update(self, robot_node):
                 self.unsent_data.append(("CMD", cmd_payload))
 
     def normalize_sensor_data(self, sensor_data):
-        """The normalize_sensor_data function scales the sensor data to a range between 0 and 1."""
-
-        sensor_data = np.array(sensor_data) / MAX_SENSOR_VALUE
-        return sensor_data
-
-    def calculate_position(self, sensor_data):
         """
-        Determines the deviation and returns an estimated position of the robot
-        with respect to a line. The estimate is made using a weighted average of
-        the sensor indices multiplied by 1000, so that a return value of 0
-        indicates that the line is directly below sensor 0, a return value of
-        1000 indicates that the line is directly below sensor 1, 2000
-        indicates that it's below sensor 2000, etc.  Intermediate values
-        indicate that the line is between two sensors. The formula is:
-
-                0*value0 + 1000*value1 + 2000*value2 + ...
-            --------------------------------------------
-                value0  +  value1  +  value2 + ...
-
-        This function assumes a dark line (high values) surrounded by white
-        (low values).
-
+        The normalize_sensor_data function scales the sensor data to a range between 0 and 1.
+        
         Parameters
         ----------
-        sensor_data : List
+            sensor_data: The state observed.
 
         Returns
         ----------
-        Estimated position with respect to track.
+            NumPy array of float32: Normalized Sensor Data
         """
 
-        estimated_pos = 0
-        numerator = 0
-        denominator = 0
-        weight = 1000
-        is_on_line = False
-
-        max_sensors = len(sensor_data)
-        sensor_max_value = 1000
-        last_pos_value = 0
-
-        for idx, sensor_value in enumerate(sensor_data):
-            numerator += idx * weight * sensor_value
-            denominator += sensor_value
-
-            # Keep track of whether we see the line at all.
-            if LINE_SENSOR_ON_TRACK_MIN_VALUE < sensor_value:
-                is_on_line = True
-
-        if False is is_on_line:
-
-            # If it last read to the left of center, return 0.
-            if last_pos_value < ((max_sensors - 1) * sensor_max_value) / 2:
-                estimated_pos = 0
-
-            # If it last read to the right of center, return the max. value.
-            else:
-                estimated_pos = (max_sensors - 1) * sensor_max_value
-
-        else:
-            # Check to avoid division by zero..
-            if denominator != 0:
-                estimated_pos = numerator / denominator
-
-            last_pos_value = estimated_pos
-
-        return estimated_pos
+        sensor_data = np.array(sensor_data) / MAX_SENSOR_VALUE
+        return sensor_data
 
-    def calculate_reward(self, sensor_data):
+    def determine_reward(self, sensor_data):
         """
-        The calculate_reward function evaluates the consequences of a certain
+        The function evaluates the consequences of a certain
         action performed in a certain state by calculating the resulting reward.
         A reward of 1 means that the robot is in the center of the Line.
-        """
-        estimated_pos = self.calculate_position(sensor_data)
 
-        # Reward scaled between 0 and 1 if the robot is on the line.
-        if 500 <= estimated_pos <= 2000:
-            reward = (((1 / 150) * estimated_pos) - (10 / 3)) / 10
-            return reward
+        Parameters
+        ----------
+            sensor_data : The state observed.
 
-        # Reward scaled between 1 and 0 if the robot is on the line.
-        if 2000 < estimated_pos <= 3500:
-            reward = (((-1 / 150) * estimated_pos) + (70 / 3))/10
-            return reward
+        Returns
+        ----------
+            float: The resulting reward.
 
-        return 0
+        """
+        reward = self.__buffer.calculate_reward(sensor_data)
+        return reward
 
-    def calculate_advantages(self, rewards, values, dones):
-        """Calculate advantages for each state in a mini-batch."""
+    # pylint: disable=too-many-arguments
+    # pylint: disable=too-many-locals
+    def learn(self, states, actions, old_probs, values, rewards, dones):
+        """
+        Perform training to optimize model weights.
 
-        mini_batch_size = len(rewards)
+        Parameters
+        ----------
+            states:    The saved states observed during interactions with the environment.
+            actions:   The saved actions taken in response to the observed states.
+            old_probs: The saved probabilities of the actions taken, based on the previous policy.
+            values:    The saved estimated values of the observed states.
+            rewards:   The saved rewards received for taking the actions.
+            dones:     The saved flags indicating whether the target sequence or episode has 
+                        been completed.
+        """
+        for _ in range(self.n_epochs):
 
-        # Create empty advantages array
-        advantages = np.zeros(mini_batch_size, dtype=np.float32)
+            # The computed advantage values for each state in a given batch of experiences.
+            advantages = self.__buffer.calculate_advantages(rewards, values, dones)
 
-        for start_index in range(mini_batch_size):
-            discount = 1
-            advantage = 0
+            # optimize Actor Network weights
+            with tf.GradientTape() as tape:
+                states = tf.convert_to_tensor(states)
+                actions = tf.convert_to_tensor(actions)
+                old_probs = tf.convert_to_tensor(old_probs)
 
-            for future_index in range(start_index, mini_batch_size - 1):
+                # Forward pass through the actor network to get the action mean
+                predict_mean = self.__neural_network.actor_network(states)
 
-                # Calculate the temporal difference (TD)
-                delta = (
-                    rewards[future_index]
-                    + (
-                        self.__gamma
-                        * values[future_index + 1]
-                        * (1 - int(dones[future_index]))
-                    )
-                    - values[future_index]
-                )
+                # Create the normal distribution with the predicted mean
+                new_dist = tfp.distributions.Normal(predict_mean, self.__std_dev)
 
-                # Accumulate the advantage using the discount factor
-                advantage += discount * delta
+                # Invert the tanh transformation to recover the original actions before tanh
+                untransformed_actions = tf.atanh(actions)
 
-                # Update the discount factor for the next step
-                discount *= self.__gamma * self.__gae_lambda
+                new_log_prob = new_dist.log_prob(untransformed_actions)
 
-                # Stop if a terminal state is reached
-                if dones[future_index]:
-                    break
+                # Compute the log of the Jacobian for the tanh transformation
+                jacobian_log_det = tf.math.log(1 - tf.square(actions))
+                adjusted_new_log_prob = new_log_prob - jacobian_log_det
 
-            # Save the calculated advantage for the current state
-            advantages[start_index] = advantage
+                # The ratio between the new model and the old model’s action log probabilities
+                prob_ratio = tf.exp(adjusted_new_log_prob - old_probs)
 
-        return advantages
+                # If the ratio is too large or too small, it will be
+                # clipped according to the surrogate function.
+                weighted_probs = prob_ratio * advantages
+                clipped_probs = tf.clip_by_value(
+                    prob_ratio, 1 - self.__policy_clip, 1 + self.__policy_clip
+                )
+                weighted_clipped_probs = clipped_probs * advantages
 
-    # pylint: disable=too-many-arguments
-    # pylint: disable=too-many-locals
-    def learn(self, states, actions, old_probs, values, rewards, dones):
-        """Perform training to optimize model weights"""
-
-        advantages = self.calculate_advantages(rewards, values, dones)
-
-        # optimize Actor Network weights
-        with tf.GradientTape() as tape:
-            states = tf.convert_to_tensor(states)
-            actions = tf.convert_to_tensor(actions)
-            old_probs = tf.convert_to_tensor(old_probs)
-
-            # Predicts an action based on the Saved states
-            predict = self.__neural_network.actor_network(states)
-            new_dist = tfp.distributions.Normal(predict, 0.1)
-            new_log_prob = new_dist.log_prob(actions)
-            transformed_action = tf.tanh(actions)
-            jacobian_log_det = tf.math.log(1 - tf.square(transformed_action))
-            adjusted_new_log_prob = new_log_prob - jacobian_log_det
-
-            # The ratio between the new model and the old model’s action log probabilities
-            prob_ratio = tf.exp(adjusted_new_log_prob - old_probs)
-
-            # If the ratio is too large or too small, it will be
-            # clipped according to the surrogate function.
-            weighted_probs = prob_ratio * advantages
-            clipped_probs = tf.clip_by_value(
-                prob_ratio, 1 - self.__policy_clip, 1 + self.__policy_clip
-            )
-            weighted_clipped_probs = clipped_probs * advantages
+                # Policy Gradient Loss
+                actor_loss = -tf.reduce_mean(
+                    tf.minimum(weighted_probs, weighted_clipped_probs)
+                )
 
-            # Policy Gradient Loss
-            actor_loss = -tf.reduce_mean(
-                tf.minimum(weighted_probs, weighted_clipped_probs)
+            # calculate gradient
+            actor_params = self.__neural_network.actor_network.trainable_variables
+            actor_grads = tape.gradient(actor_loss, actor_params)
+            self.__neural_network.actor_optimizer.apply_gradients(
+                zip(actor_grads, actor_params)
             )
 
-        # calculate gradient
-        actor_params = self.__neural_network.actor_network.trainable_variables
-        actor_grads = tape.gradient(actor_loss, actor_params)
-        self.__neural_network.actor_optimizer.apply_gradients(
-            zip(actor_grads, actor_params)
-        )
-
-        # optimize Critic Network weights
-        with tf.GradientTape() as tape:
+            # optimize Critic Network weights
+            with tf.GradientTape() as tape:
 
-            # The critical value represents the expected return from state 𝑠𝑡.
-            # It provides an estimate of how good it is to be in a given state.
-            critic_value = self.__neural_network.critic_network(states)
+                # The critic value represents the expected return from state 𝑠𝑡.
+                # It provides an estimate of how good it is to be in a given state.
+                critic_value = self.__neural_network.critic_network(states)
 
-            # the total discounted reward accumulated from time step 𝑡
-            estimate_returns = advantages + values
+                # the total discounted reward accumulated from time step 𝑡
+                estimate_returns = advantages + values
 
-            # Generate loss
-            critic_loss = tf.math.reduce_mean(
-                tf.math.pow(estimate_returns - critic_value, 2)
-            )
+                # Generate loss
+                critic_loss = tf.math.reduce_mean(
+                    tf.math.pow(estimate_returns - critic_value, 2)
+                )
 
-        # calculate gradient
-        critic_params = self.__neural_network.critic_network.trainable_variables
-        critic_grads = tape.gradient(critic_loss, critic_params)
-        self.__neural_network.critic_optimizer.apply_gradients(
-            zip(critic_grads, critic_params)
-        )
-        self.actor_loss_history.append(actor_loss.numpy())
-        self.critic_loss_history.append(critic_loss.numpy())
-        self.reward_history.append(sum(rewards))
+                # calculate gradient
+                critic_params = self.__neural_network.critic_network.trainable_variables
+                critic_grads = tape.gradient(critic_loss, critic_params)
+                self.__neural_network.critic_optimizer.apply_gradients(
+                    zip(critic_grads, critic_params)
+                )
+                self.actor_loss_history.append(actor_loss.numpy())
+                self.critic_loss_history.append(critic_loss.numpy())
+                self.reward_history.append(sum(rewards))
 
-        # saving logs in a CSV file
-        self.save_logs_to_csv()
+                # saving logs in a CSV file
+                self.save_logs_to_csv()
 
     def save_logs_to_csv(self):
         """Function for saving logs in a CSV file"""
@@ -481,7 +436,7 @@ def perform_training(self):
         if self.__current_batch is None:
 
             # Grab sample from memory
-            self.__current_batch = self.__memory.generate_batches()
+            self.__current_batch = self.__buffer.generate_batches()
 
         # Perform training with mini batches.
         if self.__training_index < len(self.__current_batch[-1]):
@@ -511,7 +466,7 @@ def perform_training(self):
             self.__training_index = 0
             self.__current_batch = None
             self.done = False
-            self.__memory.clear_memory()
+            self.__buffer.clear_memory()
             self.state = "IDLE"
             self.num_episodes += 1
             cmd_payload = struct.pack("B", CMD_ID_SET_READY_STATE)
@@ -526,7 +481,13 @@ def perform_training(self):
             self.__std_dev = max(self.__std_dev, MIN_STD_DEV)
 
     def reinitialize(self, robot_node):
-        """Re-initialization of position and orientation of The ROBOT."""
+        """
+        Re-initialization of position and orientation of The ROBOT.
+
+        Parameters
+        ----------
+            robot_node: The Robot interface
+        """
         trans_field = robot_node.getField("translation")
         rot_field = robot_node.getField("rotation")
         initial_position = POSITION_DATA
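
For reference, a minimal, self-contained sketch of the clipped-surrogate actor loss that the
learn() method above computes, including the tanh change-of-variables correction. The helper
name, the fixed standard deviation, the clip range and the small epsilon guard are illustrative
assumptions, not values taken from the patch:

    import tensorflow as tf
    import tensorflow_probability as tfp

    def clipped_surrogate_loss(action_mean, actions, old_log_probs, advantages,
                               std_dev=0.05, policy_clip=0.2, eps=1e-6):
        """PPO actor loss for actions that were squashed with tanh."""
        dist = tfp.distributions.Normal(action_mean, std_dev)

        # Undo the tanh squashing to evaluate the Gaussian log-probability.
        pre_tanh = tf.atanh(tf.clip_by_value(actions, -1.0 + eps, 1.0 - eps))
        log_probs = dist.log_prob(pre_tanh)

        # Change-of-variables correction: log|d tanh(u)/du| = log(1 - tanh(u)^2).
        log_probs -= tf.math.log(1.0 - tf.square(actions) + eps)

        # Probability ratio between the new policy and the old policy.
        ratio = tf.exp(log_probs - old_log_probs)

        # Clipped surrogate objective, negated because the optimizer minimizes.
        unclipped = ratio * advantages
        clipped = tf.clip_by_value(ratio, 1.0 - policy_clip, 1.0 + policy_clip) * advantages
        return -tf.reduce_mean(tf.minimum(unclipped, clipped))
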
diff --git a/webots/controllers/RL_Supervisor/plotting.py b/webots/controllers/RL_Supervisor/plotting.py
index 4b20bf63..8efb92b3 100644
--- a/webots/controllers/RL_Supervisor/plotting.py
+++ b/webots/controllers/RL_Supervisor/plotting.py
@@ -1,5 +1,10 @@
 """ Plotting script  with Matplotlib """
 
+# Description: This script generates three separate plots using Matplotlib to visualize
+# the Actor Loss, Critic Loss, and Reward over time based on data collected during the
+# training process. Each plot represents the respective metric as a function of
+# mini-batch steps.
+
 # Imports
 import matplotlib.pyplot as plt  # pylint: disable=import-error
 import pandas as pd  # pylint: disable=import-error
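
A minimal sketch of what the three plots could look like, assuming the CSV written by
save_logs_to_csv() holds one column per metric; the file path and the column names
("actor_loss", "critic_loss", "reward") are assumptions, since the patch does not show them:

    import matplotlib.pyplot as plt
    import pandas as pd

    # Path and column names are assumed; the patch only states that actor loss,
    # critic loss and reward are logged per mini-batch step.
    logs = pd.read_csv("logs/training_logs.csv")

    for column, label in [("actor_loss", "Actor Loss"),
                          ("critic_loss", "Critic Loss"),
                          ("reward", "Reward")]:
        plt.figure()
        plt.plot(logs[column])
        plt.xlabel("Mini-batch step")
        plt.ylabel(label)
        plt.title(f"{label} over training")

    plt.show()
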
diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py
index 1e13460d..8ae0838a 100644
--- a/webots/controllers/RL_Supervisor/rl_supervisor.py
+++ b/webots/controllers/RL_Supervisor/rl_supervisor.py
@@ -132,7 +132,7 @@ def callback_line_sensors(self, payload: bytearray) -> None:
             if self.__no_line_detection_count > 0:
                 reward = -1
             else:
-                reward = self.__agent.calculate_reward(sensor_data)
+                reward = self.__agent.determine_reward(sensor_data)
 
             # Start storage The data after the second received sensor data
             if self.last_sensor_data is not None:
@@ -272,17 +272,23 @@ def main_loop():
         while supervisor.step(timestep) != -1:
             controller.process()
 
-            # Checks if the sequence has ended
-            if agent.state == "READY":
-                agent.update(robot_node)
+            match (agent.state):
+                case "READY":
+                    agent.update(robot_node)
 
-            # Start the training
-            elif agent.state == "TRAINING":
-                controller.steps = 0
-                agent.perform_training()
+                case "TRAINING":
+                    controller.steps = 0
+                    agent.perform_training()
+                    print(f"#{agent.num_episodes} actor loss: {agent.actor_loss_history[-1]:.4f},"
+                            f"critic loss: {agent.critic_loss_history[-1]:.4f}")
 
-            if 1000 <= agent.num_episodes:
-                print(f"The number of episodes:{agent.num_episodes}")
+                    # save model
+                    if (agent.num_episodes > 1) and (agent.num_episodes % 50 == 0):
+                        print(f"The number of episodes:{agent.num_episodes}")
+                        agent.save_models()
+
+                case "IDLE":
+                    pass
 
             # Resend any unsent data
             if agent.unsent_data:
diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py
index 01767d06..44f17fce 100644
--- a/webots/controllers/RL_Supervisor/trajectory_buffer.py
+++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py
@@ -40,18 +40,23 @@
 ################################################################################
 
 
-class Memory:  # pylint: disable=too-many-instance-attributes
+class Buffer:  # pylint: disable=too-many-instance-attributes
     """Class for store and manage experience tuples during reinforcement learning"""
 
-    def __init__(self, batch_size, max_length):
+    # pylint: disable=too-many-arguments
+    def __init__(self, batch_size, max_length, gamma, gae_lambda):
         self.__states = []
         self.__probs = []
         self.__vals = []
         self.__actions = []
         self.__rewards = []
         self.__dones = []
+        self.__batch_size = batch_size
         self.__max_length = max_length
         self.__batch_size = batch_size
+        self.__gamma = gamma
+        self.__gae_lambda = gae_lambda
+        self.__current_index = 0
 
     def generate_batches(self):
         """
@@ -76,11 +81,9 @@ def generate_batches(self):
 
         # Create indices for the states and mix them randomly
         indices = np.arange(n_states, dtype=np.int64)
-        np.random.shuffle(indices)
 
         # Create batches by dividing the indices into groups of the batch_size
         batches = [indices[indx : indx + self.__batch_size] for indx in batch_start]
-        print("Number of Steps:", n_states)
 
         return (
             np.array(self.__states),
@@ -93,7 +96,13 @@ def generate_batches(self):
         )
 
     def get_sum_rewards(self) -> float:
-        """Calculate total rewards"""
+        """
+        Calculate total rewards.
+
+        Returns
+        ----------
+            float: Total rewards received.
+        """
         sum_rewards = sum(self.__rewards)
 
         return sum_rewards
@@ -108,8 +117,8 @@ def store_memory(
         ----------
             state: The state observed.
             action: The action taken.
-            prob: The probability of taking the action.
-            val: The estimated value of the state.
+            probs: The probability of taking the action.
+            vals: The estimated value of the state.
             reward: The reward received.
             done: Whether the episode is done.
         """
@@ -119,9 +128,10 @@ def store_memory(
         self.__vals.append(vals)
         self.__rewards.append(reward)
         self.__dones.append(done)
+        self.__current_index += 1
 
     def clear_memory(self):
-        """Remove transitions from the replay buffer."""
+        """Remove transitions from the trajektories buffer."""
 
         self.__states = []
         self.__probs = []
@@ -132,12 +142,129 @@ def clear_memory(self):
 
     def is_memory_full(self):
         """
-        Checks whether Memory  has reached its maximum capacity.
+        Checks whether Memory has reached its maximum capacity.
 
         Returns
         ----------
         - Bool: Memory is full or not
         """
 
-        is_full = len(self.__states) >= self.__max_length
+        is_full = self.__current_index >= self.__max_length
         return is_full
+
+    def calculate_advantages(self, rewards, values, dones):
+        """
+        The function measures how much better or worse an action
+        performed in a given state is than the average action
+        the policy would take in that state.
+
+        Parameters
+        ----------
+            rewards: The rewards received.
+            values: The estimated values of the states.
+            dones: Flags indicating whether the target sequence has been reached.
+
+        Returns
+        ----------
+            NumPy array of float32: the computed advantage values for each 
+            state in a given batch of experiences.
+        """
+
+        mini_batch_size = rewards.shape[0]
+
+        # Create empty advantages array
+        advantages = np.zeros(mini_batch_size, dtype=np.float32)
+
+        for start_index in range(mini_batch_size):
+            discount = 1
+            advantage = 0
+
+            for future_index in range(start_index, mini_batch_size - 1):
+
+                # Calculate the temporal difference (TD)
+                delta = (
+                    rewards[future_index]
+                    + (
+                        self.__gamma
+                        * values[future_index + 1]
+                        * (1 - int(dones[future_index]))
+                    )
+                    - values[future_index]
+                )
+
+                # Accumulate the advantage using the discount factor
+                advantage += discount * delta
+
+                # Update the discount factor for the next step
+                discount *= self.__gamma * self.__gae_lambda
+
+                # Stop if a terminal state is reached
+                if dones[future_index]:
+                    break
+
+            # Save the calculated advantage for the current state
+            advantages[start_index] = advantage
+
+        return advantages
+
+    def calculate_position(self, sensor_data):
+        """
+        Determines the deviation and returns an estimated position of the robot
+        with respect to a line. The estimate is made using a weighted average of
+        the sensor indices multiplied by 1000, so that a return value of 0
+        indicates that the line is directly below sensor 0, a return value of
+        1000 indicates that the line is directly below sensor 1, 2000
+        indicates that it's below sensor 2, etc. Intermediate values
+        indicate that the line is between two sensors. The formula is:
+
+                0*value0 + 1000*value1 + 2000*value2 + ...
+            --------------------------------------------
+                value0  +  value1  +  value2 + ...
+
+        This function assumes a dark line (high values) surrounded by white
+        (low values).
+
+        Parameters
+        ----------
+            sensor_data : The state observed.
+
+        Returns
+        ----------
+            float: Estimated position with respect to track.
+        """
+        estimated_pos = 0.0
+        numerator = 0.0
+        denominator = 0.0
+        weight = 1000.0
+
+        for idx, sensor_value in enumerate(sensor_data):
+            numerator += idx * weight * sensor_value
+            denominator += sensor_value
+
+        if denominator > 0:
+            estimated_pos = numerator / denominator
+
+        return estimated_pos
+
+    def calculate_reward(self, sensor_data):
+        """
+        The calculate_reward function evaluates the consequences of a certain
+        action performed in a certain state by calculating the resulting reward.
+        A reward of 1 means that the robot is in the center of the Line.
+
+        Parameters
+        ----------
+            sensor_data : The state observed.
+
+        Returns
+        ----------
+            float: the Resulting Reward
+        """
+        reward = 0.0
+        estimated_pos = self.calculate_position(sensor_data)
+
+        # Reward scaled between 0 and 1 based on the estimated position
+        if 500 <= estimated_pos <= 3500:
+            reward = 1.0 - abs(estimated_pos - 2000.0) / 1500.0
+
+        return reward
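
For reference, the advantage loop above implements the usual generalized advantage estimate,
A(t) = Σ_l (γ·λ)^l · δ(t+l) with δ(t) = r(t) + γ·(1 − done(t))·V(s(t+1)) − V(s(t)). A short
worked example of the position estimate and the reward mapping, using made-up five-sensor
readings:

    # Hypothetical reading with the line roughly under sensor 2 (values are made up).
    sensor_data = [0, 200, 800, 200, 0]
    position = (0 * 0 + 1000 * 200 + 2000 * 800 + 3000 * 200 + 4000 * 0) / (200 + 800 + 200)
    reward = 1.0 - abs(position - 2000.0) / 1500.0
    print(position, reward)  # 2000.0 1.0 -> centered on the line

    # Line shifted towards sensor 1.
    sensor_data = [0, 800, 200, 0, 0]
    position = (1000 * 800 + 2000 * 200) / (800 + 200)
    reward = 1.0 - abs(position - 2000.0) / 1500.0
    print(position, reward)  # 1200.0 0.466... -> on the line, but off-center
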

From d4f6d5a1bfb901b04b255d2bc46dee9470bbbf2d Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 26 Aug 2024 11:11:18 +0200
Subject: [PATCH 17/37] Replace match-case with if-else in RobotController
 class for Python 3.9 compatibility

---
 .../RL_Supervisor/rl_supervisor.py            | 28 +++++++++----------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py
index 8ae0838a..f5f7ac8d 100644
--- a/webots/controllers/RL_Supervisor/rl_supervisor.py
+++ b/webots/controllers/RL_Supervisor/rl_supervisor.py
@@ -272,23 +272,21 @@ def main_loop():
         while supervisor.step(timestep) != -1:
             controller.process()
 
-            match (agent.state):
-                case "READY":
-                    agent.update(robot_node)
-
-                case "TRAINING":
-                    controller.steps = 0
-                    agent.perform_training()
-                    print(f"#{agent.num_episodes} actor loss: {agent.actor_loss_history[-1]:.4f},"
-                            f"critic loss: {agent.critic_loss_history[-1]:.4f}")
+            if agent.state == "READY":
+                agent.update(robot_node)
+
+            # Start the training
+            elif agent.state == "TRAINING":
+                supervisor.last_sensor_data = None
+                controller.steps = 0
+                agent.perform_training()
 
-                    # save model
-                    if (agent.num_episodes > 1) and (agent.num_episodes % 50 == 0):
-                        print(f"The number of episodes:{agent.num_episodes}")
-                        agent.save_models()
+                print(f"#{agent.num_episodes} actor loss: {agent.actor_loss_history[-1]:.4f},"
+                            f"critic loss: {agent.critic_loss_history[-1]:.4f}")
 
-                case "IDLE":
-                    pass
+                # save model
+                if (agent.num_episodes > 1) and (agent.num_episodes % 50 == 0):
+                    agent.save_models()
 
             # Resend any unsent data
             if agent.unsent_data:

From da2ac90fb08c2aa49bedd5d870980f816174e180 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Tue, 3 Sep 2024 11:47:51 +0200
Subject: [PATCH 18/37] Replace member variable with local variable in loop()
 to prevent side effects

---
 lib/APPReinforcementLearning/src/App.cpp | 15 ++++++++-------
 lib/APPReinforcementLearning/src/App.h   |  4 ----
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp
index 9780d434..a4cf8cb9 100644
--- a/lib/APPReinforcementLearning/src/App.cpp
+++ b/lib/APPReinforcementLearning/src/App.cpp
@@ -96,6 +96,7 @@ void App::loop()
 {
     Board::getInstance().process();
     Speedometer::getInstance().process();
+    bool  isDataSent = true;
 
     if (true == m_controlInterval.isTimeout())
     {
@@ -124,7 +125,7 @@ void App::loop()
             payload = {SMPChannelPayload::Status::DONE};
         }
 
-        m_dataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload));
+        isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload));
 
         m_statusTimer.restart();
     }
@@ -133,7 +134,7 @@ void App::loop()
     if (true == m_sendLineSensorsDataInterval.isTimeout() &&
         (&DrivingState::getInstance() == m_systemStateMachine.getState()))
     {
-        m_dataSent = sendLineSensorsData();
+        isDataSent = sendLineSensorsData();
 
         m_sendLineSensorsDataInterval.restart();
     }
@@ -148,13 +149,13 @@ void App::loop()
             SMPChannelPayload::Mode payload =
                 (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE;
 
-            m_dataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload));
+            isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload));
 
             m_modeSelectionSent = true;
         }
     }
 
-    if (false == m_dataSent)
+    if (false == isDataSent)
     {
         /* Failed to send data to the supervisor. Go to error state. */
         ErrorState::getInstance().setErrorMsg("DSF");
@@ -207,7 +208,7 @@ bool App::sendLineSensorsData() const
     uint8_t         maxLineSensors   = lineSensors.getNumLineSensors();
     const uint16_t* lineSensorValues = lineSensors.getSensorValues();
     uint8_t         lineSensorIdx    = 0U;
-    bool            isDataSent       = true;
+    bool            isPayloadSent       = true;
     LineSensorData  payload;
 
     if (LINE_SENSOR_CHANNEL_DLC == (maxLineSensors * sizeof(uint16_t)))
@@ -220,9 +221,9 @@ bool App::sendLineSensorsData() const
         }
     }
 
-    isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload));
+    isPayloadSent = m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload));
 
-    return isDataSent;
+    return isPayloadSent;
 }
 
 bool App::setupSerialMuxProt()
diff --git a/lib/APPReinforcementLearning/src/App.h b/lib/APPReinforcementLearning/src/App.h
index bd3c6a1e..6fec26c9 100644
--- a/lib/APPReinforcementLearning/src/App.h
+++ b/lib/APPReinforcementLearning/src/App.h
@@ -73,7 +73,6 @@ class App
         m_statusTimer(),
         m_sendLineSensorsDataInterval(),
         m_modeSelectionSent(false),
-        m_dataSent(true),
         m_smpServer(Serial, this)
     {
     }
@@ -142,9 +141,6 @@ class App
     /** Ensure that the mode is only sent once */
     bool m_modeSelectionSent;
 
-    /** check if the data has been sent */
-    bool m_dataSent;
-
     /**
      * Setup the SerialMuxProt channels.
      *

From d39d111585829c2fcf8955ae0d3c603669fa5f2f Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Tue, 3 Sep 2024 11:52:23 +0200
Subject: [PATCH 19/37] Increase the duration of the selected mode and the
 duration to handle state changes.

---
 lib/APPReinforcementLearning/src/ReadyState.cpp | 1 +
 lib/APPReinforcementLearning/src/ReadyState.h   | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/ReadyState.cpp b/lib/APPReinforcementLearning/src/ReadyState.cpp
index 1af24879..fc8babaa 100644
--- a/lib/APPReinforcementLearning/src/ReadyState.cpp
+++ b/lib/APPReinforcementLearning/src/ReadyState.cpp
@@ -125,6 +125,7 @@ void ReadyState::process(StateMachine& sm)
     if (true == m_stateTransitionTimer.isTimeout())
     {
         sm.setState(&DrivingState::getInstance());
+        m_stateTransitionTimer.restart();
     }
 }
 
diff --git a/lib/APPReinforcementLearning/src/ReadyState.h b/lib/APPReinforcementLearning/src/ReadyState.h
index 5cef49ec..8626a46a 100644
--- a/lib/APPReinforcementLearning/src/ReadyState.h
+++ b/lib/APPReinforcementLearning/src/ReadyState.h
@@ -117,10 +117,10 @@ class ReadyState : public IState
     };
 
     /** Duration of the selected mode in ms. This is the maximum time to select a mode. */
-    static const uint32_t MODE_SELECTED_PERIOD = 100U;
+    static const uint32_t MODE_SELECTED_PERIOD = 2000U;
 
     /** Duration to handle State changes in ms. */
-    static const uint32_t STATE_TRANSITION_PERIOD = 150U;
+    static const uint32_t STATE_TRANSITION_PERIOD = 2100U;
 
     /**
      * The line sensor threshold (normalized) used to detect the track.

From 812fbb50fce48ac517980c91be5a440cdc590654 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Tue, 3 Sep 2024 12:04:37 +0200
Subject: [PATCH 20/37] Rename classes, fix bugs in calculate_advantages, and
 update hyperparameter values

---
 webots/controllers/RL_Supervisor/agent.py     | 70 ++++++++++---------
 webots/controllers/RL_Supervisor/networks.py  | 23 +++---
 .../RL_Supervisor/rl_supervisor.py            | 26 +++----
 .../RL_Supervisor/trajectory_buffer.py        | 32 +++++----
 4 files changed, 81 insertions(+), 70 deletions(-)

diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py
index 69494144..f45ef80a 100644
--- a/webots/controllers/RL_Supervisor/agent.py
+++ b/webots/controllers/RL_Supervisor/agent.py
@@ -33,8 +33,8 @@
 import numpy as np  # pylint: disable=import-error
 import tensorflow as tf  # pylint: disable=import-error
 import tensorflow_probability as tfp  # pylint: disable=import-error
-from trajectory_buffer import Buffer
-from networks import Networks
+from trajectory_buffer import Memory
+from networks import Models
 
 ################################################################################
 # Variables
@@ -57,16 +57,16 @@
 MODE_CHANNEL_NAME = "MODE"
 CMD_ID_SET_READY_STATE = 1
 CMD_ID_SET_TRAINING_STATE = 2
-POSITION_DATA = [-0.24713614078815466, 0.01, 0.013994298332013683]
+POSITION_DATA = [-0.24713614078815466, -0.04863962992854465, 0.013994298332013683]
 ORIENTATION_DATA = [
     -1.0564747468923541e-06,
     8.746699709178704e-07,
     0.9999999999990595,
-    1.5880805820884731,
+    1.5880805820884731
 ]
 MAX_SENSOR_VALUE = 1000
-MIN_STD_DEV = 0.1  # Minimum standard deviation
-STD_DEV_FACTOR = 0.9995  # Discounter standard deviation factor
+MIN_STD_DEV = 0.01  # Minimum standard deviation
+STD_DEV_FACTOR = 0.995  # Discount factor for the standard deviation
 
 ################################################################################
 # Classes
@@ -74,15 +74,18 @@
 
 
 class Agent:  # pylint: disable=too-many-instance-attributes
-    """The Agent class represents an intelligent agent that makes decisions to
-    control motors based on the position of the robot."""
+    """
+    The Agent class represents an intelligent agent that makes decisions to
+    control motors based on the position of the robot.
+    """
 
     # pylint: disable=too-many-arguments
     def __init__(
         self,
         smp_server,
         gamma=0.99,
-        alpha=0.0003,
+        actor_alpha=0.0001,
+        critic_alpha=0.0003,
         gae_lambda=0.95,
         policy_clip=0.2,
         batch_size=64,
@@ -91,17 +94,16 @@ def __init__(
         max_buffer_length=65536,
     ):
         self.__serialmux = smp_server
-        self.__alpha = alpha
         self.__policy_clip = policy_clip
         self.__chkpt_dir = chkpt_dir
         self.train_mode = False
         self.__top_speed = top_speed
-        self.__buffer = Buffer(batch_size, max_buffer_length, gamma, gae_lambda)
-        self.__neural_network = Networks(self.__alpha)
+        self.__memory = Memory(batch_size, max_buffer_length, gamma, gae_lambda)
+        self.__neural_network = Models(actor_alpha, critic_alpha)
         self.__training_index = 0  # Track batch index during training
         self.__current_batch = None  # Saving of the current batch which is in process
-        self.__std_dev = 0.9
-        self.n_epochs = 10
+        self.__std_dev = 0.05
+        self.n_epochs = 3
         self.done = False
         self.action = None
         self.value = None
@@ -139,7 +141,7 @@ def store_transition(
             reward: The reward received.
             done: Indicating whether the target sequence has been reached.
         """
-        self.__buffer.store_memory(state, action, probs, value, reward, done)
+        self.__memory.store_memory(state, action, probs, value, reward, done)
 
     def save_models(self):
         """Saves the models in the specified file."""
@@ -250,7 +252,7 @@ def update(self, robot_node):
 
         # Checks whether the sequence has ended if it is set to Training mode.
         if (self.train_mode is True) and (
-            (self.done is True) or (self.__buffer.is_memory_full() is True)
+            (self.done is True) or (self.__memory.is_memory_full() is True)
         ):
             cmd_payload = struct.pack("B", CMD_ID_SET_TRAINING_STATE)
             self.data_sent = self.__serialmux.send_data("CMD", cmd_payload)
@@ -277,11 +279,13 @@ def update(self, robot_node):
             if self.data_sent is False:
                 self.unsent_data.append(("SPEED_SET", motorcontrol))
 
+            self.reinitialize(robot_node)
             self.data_sent = self.__serialmux.send_data("CMD", cmd_payload)
 
             # Failed to send data. Appends the data to unsent_data List
             if self.data_sent is False:
                 self.unsent_data.append(("CMD", cmd_payload))
+            self.state = "IDLE"
 
     def normalize_sensor_data(self, sensor_data):
         """
@@ -314,29 +318,26 @@ def determine_reward(self, sensor_data):
             float: the Resulting Reward
 
         """
-        reward = self.__buffer.calculate_reward(sensor_data)
+        reward = self.__memory.calculate_reward(sensor_data)
         return reward
 
     # pylint: disable=too-many-arguments
     # pylint: disable=too-many-locals
-    def learn(self, states, actions, old_probs, values, rewards, dones):
+    def learn(self, states, actions, old_probs, values, rewards, advantages):
         """
         Perform training to optimize model weights.
 
         Parameters
         ----------
-            states:    The saved states observed during interactions with the environment.
-            actions:   The saved actions taken in response to the observed states.
-            old_probs: The saved probabilities of the actions taken, based on the previous policy.
-            values:    The saved estimated values of the observed states.
-            rewards:   The saved rewards received for taking the actions.
-            dones:     The saved flags indicating whether the target sequence or episode has 
-                        been completed.
+            states:     The saved states observed during interactions with the environment.
+            actions:    The saved actions taken in response to the observed states.
+            old_probs:  The saved probabilities of the actions taken, based on the previous policy.
+            values:     The saved estimated values of the observed states.
+            rewards:    The saved rewards received for taking the actions.
+            advantages: The computed advantage values for each state in the stored data.
         """
-        for _ in range(self.n_epochs):
 
-            #the computed advantage values for each state in a given batch of experiences.
-            advantages = self.__buffer.calculate_advantages(rewards, values, dones)
+        for _ in range(self.n_epochs):
 
             # optimize Actor Network weights
             with tf.GradientTape() as tape:
@@ -407,8 +408,8 @@ def learn(self, states, actions, old_probs, values, rewards, dones):
                 self.critic_loss_history.append(critic_loss.numpy())
                 self.reward_history.append(sum(rewards))
 
-                # saving logs in a CSV file
-                self.save_logs_to_csv()
+            # saving logs in a CSV file
+            self.save_logs_to_csv()
 
     def save_logs_to_csv(self):
         """Function for saving logs in a CSV file"""
@@ -436,7 +437,7 @@ def perform_training(self):
         if self.__current_batch is None:
 
             # Grab sample from memory
-            self.__current_batch = self.__buffer.generate_batches()
+            self.__current_batch = self.__memory.generate_batches()
 
         # Perform training with mini batches.
         if self.__training_index < len(self.__current_batch[-1]):
@@ -446,18 +447,19 @@ def perform_training(self):
                 old_prob_arr,
                 vals_arr,
                 reward_arr,
-                dones_arr,
+                advatage_arr,
                 batches,
             ) = self.__current_batch
             batch = batches[self.__training_index]
 
+            # pylint: disable=too-many-arguments
             self.learn(
                 state_arr[batch],
                 action_arr[batch],
                 old_prob_arr[batch],
                 vals_arr[batch],
                 reward_arr[batch],
-                dones_arr[batch],
+                advatage_arr[batch]
             )
             self.__training_index += 1
 
@@ -466,7 +468,7 @@ def perform_training(self):
             self.__training_index = 0
             self.__current_batch = None
             self.done = False
-            self.__buffer.clear_memory()
+            self.__memory.clear_memory()
             self.state = "IDLE"
             self.num_episodes += 1
             cmd_payload = struct.pack("B", CMD_ID_SET_READY_STATE)
diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py
index f3624593..1535330c 100644
--- a/webots/controllers/RL_Supervisor/networks.py
+++ b/webots/controllers/RL_Supervisor/networks.py
@@ -28,9 +28,9 @@
 # Imports
 ################################################################################
 
-import tensorflow as tf  # pylint: disable=import-error
-from tensorflow.keras import layers  # pylint: disable=import-error
-from tensorflow.keras.regularizers import l2  # pylint: disable=import-error
+from tensorflow import keras  # pylint: disable=import-error
+from keras import layers  # pylint: disable=import-error
+from keras.regularizers import l2  # pylint: disable=import-error
 
 ################################################################################
 # Variables
@@ -42,15 +42,16 @@
 ################################################################################
 
 
-class Networks:
-    """Class for building networks of actors and critics"""
+class Models:
+    """Class for building networks of actors and critics."""
 
-    def __init__(self, alpha):
-        self.__learning_rate = alpha
+    def __init__(self, actor_alpha, critic_alpha):
+        self.__actor_learning_rate = actor_alpha
+        self.__critic_learning_rate = critic_alpha
         self.actor_network = self.build_actor_network()
         self.critic_network = self.build_critic_network()
-        self.actor_optimizer = tf.keras.optimizers.Adam(self.__learning_rate)
-        self.critic_optimizer = tf.keras.optimizers.Adam(self.__learning_rate)
+        self.actor_optimizer = keras.optimizers.Adam(self.__actor_learning_rate)
+        self.critic_optimizer = keras.optimizers.Adam(self.__critic_learning_rate)
 
     def build_actor_network(self):
         """Build Actor Network."""
@@ -84,7 +85,7 @@ def build_actor_network(self):
             bias_initializer="zeros",
         )(fc3)
 
-        return tf.keras.models.Model(inputs=state_input, outputs=mean)
+        return keras.models.Model(inputs=state_input, outputs=mean)
 
     def build_critic_network(self):
         """Build Critic Network"""
@@ -113,7 +114,7 @@ def build_critic_network(self):
         )(fc2)
         value = layers.Dense(1)(fc3)  # Value output
 
-        return tf.keras.models.Model(inputs=state_input, outputs=value)
+        return keras.models.Model(inputs=state_input, outputs=value)
 
 
 ################################################################################
diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py
index f5f7ac8d..b620956a 100644
--- a/webots/controllers/RL_Supervisor/rl_supervisor.py
+++ b/webots/controllers/RL_Supervisor/rl_supervisor.py
@@ -64,7 +64,7 @@
 
 MODE_CHANNEL_NAME = "MODE"
 
-MIN_NUMBER_OF_STEPS = 200
+MIN_NUMBER_OF_STEPS = 400
 SENSOR_ID_MOST_LEFT = 0
 SENSOR_ID_MOST_RIGHT = 4
 
@@ -76,8 +76,8 @@
 ################################################################################
 
 
-class RobotController:
-    """Class for data flow control logic"""
+class RobotController:   # pylint: disable=too-many-instance-attributes
+    """Class for data flow control logic."""
 
     def __init__(self, smp_server, tick_size, agent):
         self.__smp_server = smp_server
@@ -86,6 +86,7 @@ def __init__(self, smp_server, tick_size, agent):
         self.__no_line_detection_count = 0
         self.__timestamp = 0  # Elapsed time since reset [ms]
         self.last_sensor_data = None
+        self.start_stop_line_detected = False
         self.steps = 0
 
     def callback_status(self, payload: bytearray) -> None:
@@ -94,8 +95,7 @@ def callback_status(self, payload: bytearray) -> None:
         # perform action on robot status feedback
         if payload[0] == STATUS_CHANNEL_ERROR_VAL:
             print("robot has reached error-state (max. lap time passed in robot)")
-
-            self.__agent.done = 1
+            self.__agent.done = True
 
     def callback_line_sensors(self, payload: bytearray) -> None:
         """Callback LINE_SENS Channel."""
@@ -109,21 +109,24 @@ def callback_line_sensors(self, payload: bytearray) -> None:
             self.__no_line_detection_count = 0
 
         # Detect start/stop line
-        is_start_stop = all(
-            value >= LINE_SENSOR_ON_TRACK_MIN_VALUE for value in sensor_data
-        )
+        if ((sensor_data[SENSOR_ID_MOST_LEFT] >= LINE_SENSOR_ON_TRACK_MIN_VALUE) and
+            (sensor_data[SENSOR_ID_MOST_RIGHT] >= LINE_SENSOR_ON_TRACK_MIN_VALUE)):
+            self.start_stop_line_detected = True
+
         # Detect Start/Stop Line before Finish Trajectories
-        if (is_start_stop is True) and (self.steps < MIN_NUMBER_OF_STEPS):
+        if (self.start_stop_line_detected is True) and (self.steps < MIN_NUMBER_OF_STEPS):
             sensor_data = list(sensor_data)
             sensor_data[SENSOR_ID_MOST_LEFT] = 0
             sensor_data[SENSOR_ID_MOST_RIGHT] = 0
+            self.start_stop_line_detected = False
 
         # sequence stop criterion debounce no line detection and start/stop line detected
         if self.__no_line_detection_count >= 30 or (
-            is_start_stop and (self.steps >= MIN_NUMBER_OF_STEPS)
+            (self.start_stop_line_detected is True) and (self.steps >= MIN_NUMBER_OF_STEPS)
         ):
             self.__agent.done = True
             self.__no_line_detection_count = 0
+            self.steps = 0
 
         # The sequence of states and actions is stored in memory for the training phase.
         if self.__agent.train_mode:
@@ -136,7 +139,7 @@ def callback_line_sensors(self, payload: bytearray) -> None:
 
             # Start storage The data after the second received sensor data
             if self.last_sensor_data is not None:
-                normalized_sensor_data = self.__agent.normalize_sensor_data(sensor_data)
+                normalized_sensor_data = self.__agent.normalize_sensor_data(self.last_sensor_data)
                 self.__agent.store_transition(
                     normalized_sensor_data,
                     self.__agent.action,
@@ -278,7 +281,6 @@ def main_loop():
             # Start the training
             elif agent.state == "TRAINING":
                 supervisor.last_sensor_data = None
-                controller.steps = 0
                 agent.perform_training()
 
                 print(f"#{agent.num_episodes} actor loss: {agent.actor_loss_history[-1]:.4f},"
diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py
index 44f17fce..151dcf52 100644
--- a/webots/controllers/RL_Supervisor/trajectory_buffer.py
+++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py
@@ -40,8 +40,8 @@
 ################################################################################
 
 
-class Buffer:  # pylint: disable=too-many-instance-attributes
-    """Class for store and manage experience tuples during reinforcement learning"""
+class Memory:  # pylint: disable=too-many-instance-attributes
+    """Class for store and manage experience tuples during Reinforcement learning."""
 
     # pylint: disable=too-many-arguments
     def __init__(self, batch_size, max_length, gamma, gae_lambda):
@@ -51,6 +51,7 @@ def __init__(self, batch_size, max_length, gamma, gae_lambda):
         self.__actions = []
         self.__rewards = []
         self.__dones = []
+        self.__advatages = []
         self.__batch_size = batch_size
         self.__max_length = max_length
         self.__batch_size = batch_size
@@ -85,14 +86,18 @@ def generate_batches(self):
         # Create batches by dividing the indices into groups of the batch_size
         batches = [indices[indx : indx + self.__batch_size] for indx in batch_start]
 
+        # The computed advantage values for each state in the stored data.
+        self.__advatages = self.calculate_advantages(self.__rewards,
+                                                    self.__vals, self.__dones)
         return (
             np.array(self.__states),
             np.array(self.__actions),
             np.array(self.__probs),
             np.array(self.__vals),
             np.array(self.__rewards),
-            np.array(self.__dones),
+            np.array(self.__advatages),
             batches,
+
         )
 
     def get_sum_rewards(self) -> float:
@@ -139,6 +144,8 @@ def clear_memory(self):
         self.__actions = []
         self.__rewards = []
         self.__dones = []
+        self.__advatages = []
+        self.__current_index = 0
 
     def is_memory_full(self):
         """
@@ -148,8 +155,11 @@ def is_memory_full(self):
         ----------
         - Bool: Memory is full or not
         """
+        is_full = False
+
+        if self.__current_index >= self.__max_length:
+            is_full = True
 
-        is_full = self.__current_index >= self.__max_length
         return is_full
 
     def calculate_advantages(self, rewards, values, dones):
@@ -167,19 +177,19 @@ def calculate_advantages(self, rewards, values, dones):
         Returns
         ----------
             NumPy array of float32: the computed advantage values for each 
-            state in a given batch of experiences.
+            state in the stored data.
         """
 
-        mini_batch_size = rewards.shape[0]
+        data_size = len(rewards)
 
         # Create empty advantages array
-        advantages = np.zeros(mini_batch_size, dtype=np.float32)
+        advantages = np.zeros(data_size, dtype=np.float32)
 
-        for start_index in range(mini_batch_size):
+        for start_index in range(data_size-1):
             discount = 1
             advantage = 0
 
-            for future_index in range(start_index, mini_batch_size - 1):
+            for future_index in range(start_index, data_size - 1):
 
                 # Calculate the temporal difference (TD)
                 delta = (
@@ -198,10 +208,6 @@ def calculate_advantages(self, rewards, values, dones):
                 # Update the discount factor for the next step
                 discount *= self.__gamma * self.__gae_lambda
 
-                # Stop if a terminal state is reached
-                if dones[future_index]:
-                    break
-
             # Save the calculated advantage for the current state
             advantages[start_index] = advantage
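
With this change the advantages are computed once over the whole stored rollout and returned
by generate_batches() in place of the done flags. A minimal usage sketch under that assumption
(the stored transition values are made up; gamma and gae_lambda match the Agent defaults in
this patch):

    import numpy as np
    from trajectory_buffer import Memory  # module from this patch series

    memory = Memory(batch_size=2, max_length=8, gamma=0.99, gae_lambda=0.95)

    # Store a few made-up transitions: (state, action, log-prob, value, reward, done).
    for _ in range(4):
        state = np.zeros(5, dtype=np.float32)  # hypothetical normalized sensor reading
        memory.store_memory(state, 0.0, 0.0, 0.5, 1.0, False)

    # The batch tuple now carries precomputed advantages instead of done flags.
    states, actions, probs, values, rewards, advantages, batches = memory.generate_batches()
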
 

From f8f5bc22376d12bcfd5e8d4f900382eba925b0bd Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Tue, 3 Sep 2024 12:55:47 +0200
Subject: [PATCH 21/37] Correct return type of getSelectedMode function in
 ReadyState class

---
 .../src/ReadyState.cpp                        |  2 +-
 lib/APPReinforcementLearning/src/ReadyState.h | 23 ++++++++++---------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/ReadyState.cpp b/lib/APPReinforcementLearning/src/ReadyState.cpp
index fc8babaa..415d6f7f 100644
--- a/lib/APPReinforcementLearning/src/ReadyState.cpp
+++ b/lib/APPReinforcementLearning/src/ReadyState.cpp
@@ -140,7 +140,7 @@ void ReadyState::setLapTime(uint32_t lapTime)
     m_lapTime            = lapTime;
 }
 
-uint8_t ReadyState::getSelectedMode()
+ReadyState::Mode ReadyState::getSelectedMode()
 {
     return m_mode;
 }
diff --git a/lib/APPReinforcementLearning/src/ReadyState.h b/lib/APPReinforcementLearning/src/ReadyState.h
index 8626a46a..93949f32 100644
--- a/lib/APPReinforcementLearning/src/ReadyState.h
+++ b/lib/APPReinforcementLearning/src/ReadyState.h
@@ -59,6 +59,16 @@
 class ReadyState : public IState
 {
 public:
+    /**
+     * The mode that can be selected.
+     */
+    enum Mode: uint8_t
+    {
+        IDLE = 0,     /**< No mode has been selected. */
+        DRIVING_MODE, /**< Driving mode Selected. */
+        TRAINING_MODE /**< Training mode Selected. */
+    };
+
     /**
      * Get state instance.
      *
@@ -100,21 +110,12 @@ class ReadyState : public IState
     /**
      *  Get the selected mode.
      *
-     * @return It returns 1 if DrivingMode is selected or 2 if TrainingMode is selected.
+     * @return It returns 1 if DrivingMode is selected or 2 if TrainingMode is selected, otherwise 0.
      */
-    uint8_t getSelectedMode();
+    Mode getSelectedMode();
 
 protected:
 private:
-    /**
-     * The mode that can be selected.
-     */
-    enum Mode
-    {
-        IDLE = 0,     /**< No mode has been selected*/
-        DRIVING_MODE, /**< Driving mode Selected. */
-        TRAINING_MODE /**< Training mode Selected. */
-    };
 
     /** Duration of the selected mode in ms. This is the maximum time to select a mode. */
     static const uint32_t MODE_SELECTED_PERIOD = 2000U;

From b3996880c7acf6e0490ea4a34ed14df393ac7acf Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 9 Sep 2024 17:03:03 +0200
Subject: [PATCH 22/37] Improve code formatting and update comments for clarity

---
 lib/APPReinforcementLearning/src/ReadyState.h    | 2 +-
 lib/APPReinforcementLearning/src/TrainingState.h | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/ReadyState.h b/lib/APPReinforcementLearning/src/ReadyState.h
index 93949f32..bb827e55 100644
--- a/lib/APPReinforcementLearning/src/ReadyState.h
+++ b/lib/APPReinforcementLearning/src/ReadyState.h
@@ -110,7 +110,7 @@ class ReadyState : public IState
     /**
      *  Get the selected mode.
      *
-     * @return It returns 1 if DrivingMode is selected or 2 if TrainingMode is selected, otherwise 0.
+     * @return It returns the selected Mode.
      */
     Mode getSelectedMode();
 
diff --git a/lib/APPReinforcementLearning/src/TrainingState.h b/lib/APPReinforcementLearning/src/TrainingState.h
index 737d7a89..a5ad381f 100644
--- a/lib/APPReinforcementLearning/src/TrainingState.h
+++ b/lib/APPReinforcementLearning/src/TrainingState.h
@@ -131,7 +131,3 @@ class TrainingState : public IState
 
 #endif /* TRAINING_STATE_H */
 /** @} */
-
-/******************************************************************************
- * Local Functions
- *****************************************************************************/

From 0260fadac1e9b9d3bd77d17847e1098882a92447 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 9 Sep 2024 17:04:36 +0200
Subject: [PATCH 23/37] Add conditions to verify and identify failed data

---
 lib/APPReinforcementLearning/src/App.cpp | 32 ++++++++++++++++++------
 lib/APPReinforcementLearning/src/App.h   |  2 +-
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp
index a4cf8cb9..f7a9e725 100644
--- a/lib/APPReinforcementLearning/src/App.cpp
+++ b/lib/APPReinforcementLearning/src/App.cpp
@@ -96,7 +96,9 @@ void App::loop()
 {
     Board::getInstance().process();
     Speedometer::getInstance().process();
-    bool  isDataSent = true;
+    bool  isSensorDataSent = true;
+    bool  isModeDataSent   = true;
+    bool  isStatusDataSent = true;
 
     if (true == m_controlInterval.isTimeout())
     {
@@ -125,7 +127,7 @@ void App::loop()
             payload = {SMPChannelPayload::Status::DONE};
         }
 
-        isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload));
+        isStatusDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload));
 
         m_statusTimer.restart();
     }
@@ -134,7 +136,7 @@ void App::loop()
     if (true == m_sendLineSensorsDataInterval.isTimeout() &&
         (&DrivingState::getInstance() == m_systemStateMachine.getState()))
     {
-        isDataSent = sendLineSensorsData();
+        isSensorDataSent = sendLineSensorsData();
 
         m_sendLineSensorsDataInterval.restart();
     }
@@ -149,16 +151,30 @@ void App::loop()
             SMPChannelPayload::Mode payload =
                 (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE;
 
-            isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload));
+            isModeDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload));
 
             m_modeSelectionSent = true;
         }
     }
 
-    if (false == isDataSent)
+    if (false == isSensorDataSent)
     {
-        /* Failed to send data to the supervisor. Go to error state. */
-        ErrorState::getInstance().setErrorMsg("DSF");
+        /* Failed to send Sensor data to the supervisor. Go to error state. */
+        ErrorState::getInstance().setErrorMsg("SEND_SF");
+        m_systemStateMachine.setState(&ErrorState::getInstance());
+    }
+
+    if (false == isStatusDataSent)
+    {
+        /* Failed to send Status data to the supervisor. Go to error state. */
+        ErrorState::getInstance().setErrorMsg("SD_SF");
+        m_systemStateMachine.setState(&ErrorState::getInstance());
+    }
+
+    if (false == isModeDataSent)
+    {
+        /* Failed to send Mode data to the supervisor. Go to error state. */
+        ErrorState::getInstance().setErrorMsg("MD_SF");
         m_systemStateMachine.setState(&ErrorState::getInstance());
     }
 
@@ -208,7 +224,7 @@ bool App::sendLineSensorsData() const
     uint8_t         maxLineSensors   = lineSensors.getNumLineSensors();
     const uint16_t* lineSensorValues = lineSensors.getSensorValues();
     uint8_t         lineSensorIdx    = 0U;
-    bool            isPayloadSent       = true;
+    bool            isPayloadSent    = true;
     LineSensorData  payload;
 
     if (LINE_SENSOR_CHANNEL_DLC == (maxLineSensors * sizeof(uint16_t)))
diff --git a/lib/APPReinforcementLearning/src/App.h b/lib/APPReinforcementLearning/src/App.h
index 6fec26c9..c15d3281 100644
--- a/lib/APPReinforcementLearning/src/App.h
+++ b/lib/APPReinforcementLearning/src/App.h
@@ -151,7 +151,7 @@ class App
     /**
      * Send line sensors data via SerialMuxProt.
      * 
-     * * @return true if data has been sent, false otherwise.
+     * @return true if data has been sent, false otherwise.
      */
     bool sendLineSensorsData() const;
 

From 2add9fa863045fe234c37b78b3cce95b753edb22 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 9 Sep 2024 17:17:16 +0200
Subject: [PATCH 24/37] refactor: enhance clarity in RL_Supervisor, Agent, and
 networks code

---
 webots/controllers/RL_Supervisor/agent.py     | 164 +++++++-----------
 webots/controllers/RL_Supervisor/networks.py  | 133 +++++++++++++-
 .../RL_Supervisor/rl_supervisor.py            |  56 +++---
 3 files changed, 212 insertions(+), 141 deletions(-)

diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py
index f45ef80a..20ef40ac 100644
--- a/webots/controllers/RL_Supervisor/agent.py
+++ b/webots/controllers/RL_Supervisor/agent.py
@@ -27,14 +27,15 @@
 ################################################################################
 # Imports
 ################################################################################
+
 import csv
 import os
 import struct
-import numpy as np  # pylint: disable=import-error
-import tensorflow as tf  # pylint: disable=import-error
-import tensorflow_probability as tfp  # pylint: disable=import-error
-from trajectory_buffer import Memory
-from networks import Models
+import numpy as np
+import tensorflow as tf
+import tensorflow_probability as tfp
+from   trajectory_buffer import Memory
+from   networks import Models
 
 ################################################################################
 # Variables
@@ -68,6 +69,16 @@
 MIN_STD_DEV = 0.01  # Minimum standard deviation
 STD_DEV_FACTOR = 0.995  # Discount factor for the standard deviation
 
+TRANSLATION_FIELD = "translation"
+ROTATION_FIELD = "rotation"
+
+IDLE = "IDLE_STATE"
+READY = "READY_STATE"
+TRAINING = "TRAINING_STATE"
+
+DIRECTORY = "logs"
+FILE_DIRECTORY = "training_logs.csv"
+
 ################################################################################
 # Classes
 ################################################################################
@@ -94,37 +105,34 @@ def __init__(
         max_buffer_length=65536,
     ):
         self.__serialmux = smp_server
-        self.__policy_clip = policy_clip
         self.__chkpt_dir = chkpt_dir
         self.train_mode = False
         self.__top_speed = top_speed
+        self.__std_dev = 0.05
         self.__memory = Memory(batch_size, max_buffer_length, gamma, gae_lambda)
-        self.__neural_network = Models(actor_alpha, critic_alpha)
+        self.__neural_network = Models(actor_alpha, critic_alpha, self.__std_dev, policy_clip)
         self.__training_index = 0  # Track batch index during training
         self.__current_batch = None  # Saving of the current batch which is in process
-        self.__std_dev = 0.05
         self.n_epochs = 3
         self.done = False
         self.action = None
         self.value = None
         self.adjusted_log_prob = None
         self.num_episodes = 0
-        self.state = "IDLE"
+        self.state = IDLE
         self.data_sent = True
         self.unsent_data = []
-        self.critic_loss_history = []
-        self.actor_loss_history = []
         self.reward_history = []
 
     def set_train_mode(self):
         """Set the Agent mode to train mode."""
         self.train_mode = True
-        self.state = "READY"
+        self.state = READY
 
     def set_drive_mode(self):
         """Set the Agent mode to drive mode."""
         self.train_mode = False
-        self.state = "READY"
+        self.state = READY
         self.num_episodes = 0
 
     def store_transition(
@@ -235,11 +243,11 @@ def send_motor_speeds(self, state):
         right_motor_speed = int(self.__top_speed + speed_difference)
 
         control_data = struct.pack("2H", left_motor_speed, right_motor_speed)
-        self.data_sent = self.__serialmux.send_data("SPEED_SET", control_data)
+        self.data_sent = self.__serialmux.send_data(SPEED_SET_CHANNEL_NAME, control_data)
 
         # Failed to send data. Appends the data to unsent_data List.
         if self.data_sent is False:
-            self.unsent_data.append(("SPEED_SET", control_data))
+            self.unsent_data.append((SPEED_SET_CHANNEL_NAME, control_data))
 
     def update(self, robot_node):
         """
@@ -251,19 +259,19 @@ def update(self, robot_node):
         """
 
         # Checks whether the sequence has ended if it is set to Training mode.
-        if (self.train_mode is True) and (
-            (self.done is True) or (self.__memory.is_memory_full() is True)
-        ):
+        if self.train_mode is True  and (
+            self.done is True or self.__memory.is_memory_full() is True):
+
             cmd_payload = struct.pack("B", CMD_ID_SET_TRAINING_STATE)
-            self.data_sent = self.__serialmux.send_data("CMD", cmd_payload)
+            self.data_sent = self.__serialmux.send_data(COMMAND_CHANNEL_NAME, cmd_payload)
 
             # Failed to send data. Appends the data to unsent_data List.
             if self.data_sent is False:
-                self.unsent_data.append(("CMD", cmd_payload))
+                self.unsent_data.append((COMMAND_CHANNEL_NAME, cmd_payload))
 
             # Stopping condition for sequence was reached.
             self.reinitialize(robot_node)
-            self.state = "TRAINING"
+            self.state = TRAINING
 
         # Checks whether the sequence has ended if it is set to driving mode.
         if (self.train_mode is False) and (self.done is True):
@@ -272,20 +280,20 @@ def update(self, robot_node):
             cmd_payload = struct.pack("B", CMD_ID_SET_READY_STATE)
 
             self.data_sent = self.__serialmux.send_data(
-                "SPEED_SET", motorcontrol
+                SPEED_SET_CHANNEL_NAME, motorcontrol
             )  # stop the motors immediately
 
             # Failed to send data. Appends the data to unsent_data List
             if self.data_sent is False:
-                self.unsent_data.append(("SPEED_SET", motorcontrol))
+                self.unsent_data.append((SPEED_SET_CHANNEL_NAME, motorcontrol))
 
             self.reinitialize(robot_node)
-            self.data_sent = self.__serialmux.send_data("CMD", cmd_payload)
+            self.data_sent = self.__serialmux.send_data(COMMAND_CHANNEL_NAME, cmd_payload)
 
             # Failed to send data. Appends the data to unsent_data List
             if self.data_sent is False:
-                self.unsent_data.append(("CMD", cmd_payload))
-            self.state = "IDLE"
+                self.unsent_data.append((COMMAND_CHANNEL_NAME, cmd_payload))
+            self.state = IDLE
 
     def normalize_sensor_data(self, sensor_data):
         """
@@ -300,8 +308,8 @@ def normalize_sensor_data(self, sensor_data):
             NumPy array of float32: Normalized Sensor Data
         """
 
-        sensor_data = np.array(sensor_data) / MAX_SENSOR_VALUE
-        return sensor_data
+        normalized_sensor_data = np.array(sensor_data) / MAX_SENSOR_VALUE
+        return normalized_sensor_data
 
     def determine_reward(self, sensor_data):
         """
@@ -315,14 +323,12 @@ def determine_reward(self, sensor_data):
 
         Returns
         ----------
-            float: the Resulting Reward
+            float: The Resulting Reward.
 
         """
         reward = self.__memory.calculate_reward(sensor_data)
         return reward
 
-    # pylint: disable=too-many-arguments
-    # pylint: disable=too-many-locals
     def learn(self, states, actions, old_probs, values, rewards, advantages):
         """
         Perform training to optimize model weights.
@@ -336,88 +342,34 @@ def learn(self, states, actions, old_probs, values, rewards, advantages):
             rewards:    The saved rewards received for taking the actions.
             advantages: the computed advantage values for each state in a given Data size. 
         """
+        # scales the sensor data to a range between 0 and 1
+        m_states = self.normalize_sensor_data(states)
 
         for _ in range(self.n_epochs):
 
-            # optimize Actor Network weights
-            with tf.GradientTape() as tape:
-                states = tf.convert_to_tensor(states)
-                actions = tf.convert_to_tensor(actions)
-                old_probs = tf.convert_to_tensor(old_probs)
-
-                # Forward pass through the actor network to get the action mean
-                predict_mean = self.__neural_network.actor_network(states)
-
-                # Create the normal distribution with the predicted mean
-                new_dist = tfp.distributions.Normal(predict_mean, self.__std_dev)
-
-                # Invert the tanh transformation to recover the original actions before tanh
-                untransformed_actions = tf.atanh(actions)
-
-                new_log_prob = new_dist.log_prob(untransformed_actions)
+            states = tf.convert_to_tensor(m_states)
+            actions = tf.convert_to_tensor(actions)
+            old_probs = tf.convert_to_tensor(old_probs)
 
-                # Compute the log of the Jacobian for the tanh transformation
-                jacobian_log_det = tf.math.log(1 - tf.square(actions))
-                adjusted_new_log_prob = new_log_prob - jacobian_log_det
-
-                # The ratio between the new model and the old model’s action log probabilities
-                prob_ratio = tf.exp(adjusted_new_log_prob - old_probs)
-
-                # If the ratio is too large or too small, it will be
-                # clipped according to the surrogate function.
-                weighted_probs = prob_ratio * advantages
-                clipped_probs = tf.clip_by_value(
-                    prob_ratio, 1 - self.__policy_clip, 1 + self.__policy_clip
-                )
-                weighted_clipped_probs = clipped_probs * advantages
-
-                # Policy Gradient Loss
-                actor_loss = -tf.reduce_mean(
-                    tf.minimum(weighted_probs, weighted_clipped_probs)
-                )
-
-            # calculate gradient
-            actor_params = self.__neural_network.actor_network.trainable_variables
-            actor_grads = tape.gradient(actor_loss, actor_params)
-            self.__neural_network.actor_optimizer.apply_gradients(
-                zip(actor_grads, actor_params)
-            )
+            # optimize Actor Network weights
+            self.__neural_network.compute_actor_gradient(states, actions, old_probs, advantages)
 
             # optimize Critic Network weights
-            with tf.GradientTape() as tape:
-
-                # The critical value represents the expected return from state 𝑠𝑡.
-                # It provides an estimate of how good it is to be in a given state.
-                critic_value = self.__neural_network.critic_network(states)
-
-                # the total discounted reward accumulated from time step 𝑡
-                estimate_returns = advantages + values
+            self.__neural_network.compute_critic_gradient(states, values, advantages)
 
-                # Generate loss
-                critic_loss = tf.math.reduce_mean(
-                    tf.math.pow(estimate_returns - critic_value, 2)
-                )
-
-                # calculate gradient
-                critic_params = self.__neural_network.critic_network.trainable_variables
-                critic_grads = tape.gradient(critic_loss, critic_params)
-                self.__neural_network.critic_optimizer.apply_gradients(
-                    zip(critic_grads, critic_params)
-                )
-                self.actor_loss_history.append(actor_loss.numpy())
-                self.critic_loss_history.append(critic_loss.numpy())
-                self.reward_history.append(sum(rewards))
+            # Save the rewards received
+            self.reward_history.append(sum(rewards))
 
-            # saving logs in a CSV file
-            self.save_logs_to_csv()
+        # saving logs in a CSV file
+        self.save_logs_to_csv()
 
     def save_logs_to_csv(self):
         """Function for saving logs in a CSV file"""
 
         # Ensure the directory exists
-        log_dir = "logs"
+        log_dir = DIRECTORY
         os.makedirs(log_dir, exist_ok=True)
-        log_file = os.path.join(log_dir, "training_logs.csv")
+        log_file = os.path.join(log_dir, FILE_DIRECTORY)
 
         with open(log_file, mode="w", encoding="utf-8", newline="") as file:
             writer = csv.writer(file)
@@ -425,8 +377,8 @@ def save_logs_to_csv(self):
             for indx, reward in enumerate(self.reward_history):
                 writer.writerow(
                     [
-                        self.actor_loss_history[indx],
-                        self.critic_loss_history[indx],
+                        self.__neural_network.actor_loss_history[indx],
+                        self.__neural_network.critic_loss_history[indx],
                         reward,
                     ]
                 )
@@ -469,14 +421,14 @@ def perform_training(self):
             self.__current_batch = None
             self.done = False
             self.__memory.clear_memory()
-            self.state = "IDLE"
+            self.state = IDLE
             self.num_episodes += 1
             cmd_payload = struct.pack("B", CMD_ID_SET_READY_STATE)
-            self.data_sent = self.__serialmux.send_data("CMD", cmd_payload)
+            self.data_sent = self.__serialmux.send_data(COMMAND_CHANNEL_NAME, cmd_payload)
 
             # Failed to send data. Appends the data to unsent_data List
             if self.data_sent is False:
-                self.unsent_data.append(("CMD", cmd_payload))
+                self.unsent_data.append((COMMAND_CHANNEL_NAME, cmd_payload))
 
             # Minimize standard deviation until the minimum standard deviation is reached
             self.__std_dev = self.__std_dev * STD_DEV_FACTOR
@@ -490,8 +442,8 @@ def reinitialize(self, robot_node):
         ----------
             robot_node: The Robot interface
         """
-        trans_field = robot_node.getField("translation")
-        rot_field = robot_node.getField("rotation")
+        trans_field = robot_node.getField(TRANSLATION_FIELD)
+        rot_field = robot_node.getField(ROTATION_FIELD)
         initial_position = POSITION_DATA
         initial_orientation = ORIENTATION_DATA
         trans_field.setSFVec3f(initial_position)
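
In this patch, Agent.learn() delegates the PPO update to the new network helpers. To make the clipped-surrogate objective behind compute_actor_gradient concrete, here is a minimal standalone sketch; the POLICY_CLIP value, the helper name and the sample tensors are placeholders for illustration, not the project's configuration.

import tensorflow as tf

POLICY_CLIP = 0.2  # placeholder clip range, not the project's configured value

def clipped_surrogate_loss(new_log_probs, old_log_probs, advantages):
    # Probability ratio between the new and the old policy.
    prob_ratio = tf.exp(new_log_probs - old_log_probs)

    # Unclipped and clipped weighted advantages.
    weighted = prob_ratio * advantages
    clipped = tf.clip_by_value(prob_ratio, 1.0 - POLICY_CLIP, 1.0 + POLICY_CLIP) * advantages

    # Pessimistic (minimum) objective, negated to obtain a loss to minimize.
    return -tf.reduce_mean(tf.minimum(weighted, clipped))

# A ratio far above 1 + POLICY_CLIP gets no extra credit for a positive advantage.
print(clipped_surrogate_loss(tf.constant([1.0]), tf.constant([0.0]), tf.constant([2.0])))
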
diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py
index 1535330c..9deee19b 100644
--- a/webots/controllers/RL_Supervisor/networks.py
+++ b/webots/controllers/RL_Supervisor/networks.py
@@ -28,35 +28,43 @@
 # Imports
 ################################################################################
 
-from tensorflow import keras  # pylint: disable=import-error
-from keras import layers  # pylint: disable=import-error
-from keras.regularizers import l2  # pylint: disable=import-error
+import tensorflow as tf
+import tensorflow_probability as tfp
+from tensorflow import keras
+from keras import layers
+from keras.regularizers import l2
 
 ################################################################################
 # Variables
 ################################################################################
 
+# Constants
+NUM_SENSORS = 5  # Assuming 5 sensor inputs
 
 ################################################################################
 # Classes
 ################################################################################
 
 
-class Models:
+class Models:  # pylint: disable=too-many-instance-attributes
     """Class for building networks of actors and critics."""
 
-    def __init__(self, actor_alpha, critic_alpha):
+    def __init__(self, actor_alpha, critic_alpha, std_dev, policy_clip):
         self.__actor_learning_rate = actor_alpha
         self.__critic_learning_rate = critic_alpha
         self.actor_network = self.build_actor_network()
-        self.critic_network = self.build_critic_network()
+        self.critic_network = self.build_actor_network()
+        self.std_dev = std_dev
+        self.policy_clip = policy_clip
         self.actor_optimizer = keras.optimizers.Adam(self.__actor_learning_rate)
         self.critic_optimizer = keras.optimizers.Adam(self.__critic_learning_rate)
+        self.critic_loss_history = []
+        self.actor_loss_history = []
 
     def build_actor_network(self):
         """Build Actor Network."""
 
-        state_input = layers.Input(shape=(5,))  # Assuming 5 sensor inputs
+        state_input = layers.Input(shape=(NUM_SENSORS,))
         fc1 = layers.Dense(
             64,
             activation="relu",
@@ -90,7 +98,7 @@ def build_actor_network(self):
     def build_critic_network(self):
         """Build Critic Network"""
 
-        state_input = layers.Input(shape=(5,))  # Assuming 5 sensor inputs
+        state_input = layers.Input(shape=(NUM_SENSORS,))
         fc1 = layers.Dense(
             64,
             activation="relu",
@@ -116,6 +124,115 @@ def build_critic_network(self):
 
         return keras.models.Model(inputs=state_input, outputs=value)
 
+    def compute_critic_gradient(self, states, values, advantages):
+        """ optimize Critic Network weights.
+        
+        Parameters
+        ----------
+            states:     The saved states observed during interactions with the environment.
+            values:     The saved estimated values of the observed states.
+            advantages: The computed advantage values for each state in a given batch.
+        """
+
+        with tf.GradientTape() as tape:
+
+            # The critical value represents the expected return from state 𝑠𝑡.
+            # It provides an estimate of how good it is to be in a given state.
+            critic_value = self.critic_network(states)
+
+            # the total discounted reward accumulated from time step 𝑡
+            estimate_returns = advantages + values
+
+            # Generate loss
+            critic_loss = tf.math.reduce_mean(
+                tf.math.pow(estimate_returns - critic_value, 2)
+            )
+            # calculate gradient
+            critic_params = self.critic_network.trainable_variables
+            critic_grads = tape.gradient(critic_loss, critic_params)
+            self.critic_optimizer.apply_gradients(
+                zip(critic_grads, critic_params)
+            )
+
+            # save the critic Loss
+            self.critic_loss_history.append(critic_loss.numpy())
+
+    def calculate_adjusted_log_probability(self, states, actions):
+        """ The function computes the logarithmic probability of a given action.
+
+        Parameters
+        ----------
+            states:  The saved states observed during interactions with the environment.
+            actions: The saved actions taken in response to the observed states.
+
+        Returns
+        ---------- 
+            A TensorFlow tensor with the data type float32: The logarithmic probability 
+                                                            of a given action
+
+        """
+        # Forward pass through the actor network to get the action mean
+        predict_mean = self.actor_network(states)
+
+        # Create the normal distribution with the predicted mean
+        new_dist = tfp.distributions.Normal(predict_mean, self.std_dev)
+
+        # Invert the tanh transformation to recover the original actions before tanh
+        untransformed_actions = tf.atanh(actions)
+
+        new_log_prob = new_dist.log_prob(untransformed_actions)
+
+        # Compute the log of the Jacobian for the tanh transformation
+        # adding 1e-6 ensures that the Value remains stable and avoids potential issues
+        # during computation
+        jacobian_log_det = tf.math.log(1 - tf.square(actions) + 1e-6)
+
+        adjusted_log_prob = new_log_prob - jacobian_log_det
+
+        return adjusted_log_prob
+
+    def compute_actor_gradient(self, states, actions, old_probs, advantages):
+        """ optimize Actor Network weights.
+
+        Parameters
+        ----------
+            states:     The saved states observed during interactions with the environment.
+            actions:    The saved actions taken in response to the observed states.
+            old_probs:  The saved probabilities of the actions taken, based on the previous policy.
+            advantages: The computed advantage values for each state in a given batch.
+        """
+
+        with tf.GradientTape() as tape:
+
+            adjusted_new_log_prob = self.calculate_adjusted_log_probability(states, actions)
+
+            # The ratio between the new model and the old model’s action log probabilities
+            prob_ratio = tf.exp(adjusted_new_log_prob - old_probs)
+
+            # If the ratio is too large or too small, it will be
+            # clipped according to the surrogate function.
+            weighted_probs = prob_ratio * advantages
+            clipped_probs = tf.clip_by_value(
+                prob_ratio, 1 - self.policy_clip, 1 + self.policy_clip
+            )
+            weighted_clipped_probs = clipped_probs * advantages
+
+            # Policy Gradient Loss
+            actor_loss = -tf.reduce_mean(
+                tf.minimum(weighted_probs, weighted_clipped_probs)
+            )
+
+        # calculate gradient
+        actor_params = self.actor_network.trainable_variables
+        actor_grads = tape.gradient(actor_loss, actor_params)
+        self.actor_optimizer.apply_gradients(
+            zip(actor_grads, actor_params)
+            )
+
+        # save the Actor Loss
+        self.actor_loss_history.append(actor_loss.numpy())
+
 
 ################################################################################
 # Functions
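
The change-of-variables correction performed in calculate_adjusted_log_probability can be exercised on its own with the sketch below. It is illustrative only (the helper name and the sample tensors are invented for this note); the 1e-6 term keeps the logarithm finite when an action lies exactly on the +/-1 boundary.

import tensorflow as tf
import tensorflow_probability as tfp

def tanh_corrected_log_prob(action_mean, std_dev, squashed_actions):
    """Log-probability of tanh-squashed actions under a Normal policy."""
    dist = tfp.distributions.Normal(action_mean, std_dev)

    # Undo the tanh squashing to evaluate the base distribution.
    raw_actions = tf.atanh(squashed_actions)
    base_log_prob = dist.log_prob(raw_actions)

    # Jacobian term of the tanh transformation, stabilized with 1e-6.
    jacobian_log_det = tf.math.log(1.0 - tf.square(squashed_actions) + 1e-6)

    return base_log_prob - jacobian_log_det

# Example: a single squashed action of 0.1 under mean 0.0 and std dev 0.05.
print(tanh_corrected_log_prob(tf.constant([0.0]), 0.05, tf.constant([0.1])))
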
diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py
index b620956a..e257474c 100644
--- a/webots/controllers/RL_Supervisor/rl_supervisor.py
+++ b/webots/controllers/RL_Supervisor/rl_supervisor.py
@@ -68,6 +68,10 @@
 SENSOR_ID_MOST_LEFT = 0
 SENSOR_ID_MOST_RIGHT = 4
 
+IDLE = "IDLE_STATE"
+READY = "READY_STATE"
+TRAINING = "TRAINING_STATE"
+
 # Path of saved models
 PATH = "models/"
 
@@ -76,7 +80,7 @@
 ################################################################################
 
 
-class RobotController:   # pylint: disable=too-many-instance-attributes
+class RobotController:
     """Class for data flow control logic."""
 
     def __init__(self, smp_server, tick_size, agent):
@@ -86,7 +90,6 @@ def __init__(self, smp_server, tick_size, agent):
         self.__no_line_detection_count = 0
         self.__timestamp = 0  # Elapsed time since reset [ms]
         self.last_sensor_data = None
-        self.start_stop_line_detected = False
         self.steps = 0
 
     def callback_status(self, payload: bytearray) -> None:
@@ -102,6 +105,7 @@ def callback_line_sensors(self, payload: bytearray) -> None:
         sensor_data = struct.unpack("5H", payload)
         self.steps += 1
 
+        is_start_stop_line_detected = False
         # Determine lost line condition
         if all(value == 0 for value in sensor_data):
             self.__no_line_detection_count += 1
@@ -111,19 +115,18 @@ def callback_line_sensors(self, payload: bytearray) -> None:
         # Detect start/stop line
         if ((sensor_data[SENSOR_ID_MOST_LEFT] >= LINE_SENSOR_ON_TRACK_MIN_VALUE) and
             (sensor_data[SENSOR_ID_MOST_RIGHT] >= LINE_SENSOR_ON_TRACK_MIN_VALUE)):
-            self.start_stop_line_detected = True
+            is_start_stop_line_detected = True
 
         # Detect Start/Stop Line before Finish Trajectories
-        if (self.start_stop_line_detected is True) and (self.steps < MIN_NUMBER_OF_STEPS):
+        if (is_start_stop_line_detected is True) and (self.steps < MIN_NUMBER_OF_STEPS):
             sensor_data = list(sensor_data)
-            sensor_data[SENSOR_ID_MOST_LEFT] = 0
+            sensor_data[SENSOR_ID_MOST_LEFT]  = 0
             sensor_data[SENSOR_ID_MOST_RIGHT] = 0
-            self.start_stop_line_detected = False
+            is_start_stop_line_detected       = False
 
         # sequence stop criterion debounce no line detection and start/stop line detected
-        if self.__no_line_detection_count >= 30 or (
-            (self.start_stop_line_detected is True) and (self.steps >= MIN_NUMBER_OF_STEPS)
-        ):
+        if ((self.__no_line_detection_count >= 30) or ((is_start_stop_line_detected is True)
+                                                and  (self.steps >= MIN_NUMBER_OF_STEPS))):
             self.__agent.done = True
             self.__no_line_detection_count = 0
             self.steps = 0
@@ -139,9 +142,8 @@ def callback_line_sensors(self, payload: bytearray) -> None:
 
             # Start storage The data after the second received sensor data
             if self.last_sensor_data is not None:
-                normalized_sensor_data = self.__agent.normalize_sensor_data(self.last_sensor_data)
                 self.__agent.store_transition(
-                    normalized_sensor_data,
+                    self.last_sensor_data,
                     self.__agent.action,
                     self.__agent.adjusted_log_prob,
                     self.__agent.value,
@@ -151,7 +153,7 @@ def callback_line_sensors(self, payload: bytearray) -> None:
             self.last_sensor_data = sensor_data
 
         # Sends the motor speeds to the robot.
-        if self.__agent.done is False and self.__agent.state == "READY":
+        if self.__agent.done is False and self.__agent.state == READY:
             self.__agent.send_motor_speeds(sensor_data)
 
     def callback_mode(self, payload: bytearray) -> None:
@@ -191,6 +193,20 @@ def process(self):
         # process new data (callbacks will be executed)
         self.__smp_server.process(self.__timestamp)
 
+    def manage_agent_cycle(self, robot_node):
+        """Control the Agent's behavior depending on its current state."""
+        if self.__agent.state == READY:
+            self.__agent.update(robot_node)
+
+        # Start the training
+        elif self.__agent.state == TRAINING:
+            self.last_sensor_data = None
+            self.__agent.perform_training()
+
+            # save model
+            if (self.__agent.num_episodes > 1) and (self.__agent.num_episodes % 50 == 0):
+                self.__agent.save_models()
+
 
 ################################################################################
 # Functions
@@ -198,7 +214,6 @@ def process(self):
 
 
 # pylint: disable=duplicate-code
-# pylint: disable=too-many-branches
 # pylint: disable=too-many-statements
 def main_loop():
     """Main loop:
@@ -275,20 +290,7 @@ def main_loop():
         while supervisor.step(timestep) != -1:
             controller.process()
 
-            if agent.state == "READY":
-                agent.update(robot_node)
-
-            # Start the training
-            elif agent.state == "TRAINING":
-                supervisor.last_sensor_data = None
-                agent.perform_training()
-
-                print(f"#{agent.num_episodes} actor loss: {agent.actor_loss_history[-1]:.4f},"
-                            f"critic loss: {agent.critic_loss_history[-1]:.4f}")
-
-                # save model
-                if (agent.num_episodes > 1) and (agent.num_episodes % 50 == 0):
-                    agent.save_models()
+            controller.manage_agent_cycle(robot_node)
 
             # Resent any unsent Data
             if agent.unsent_data:
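
The stop criterion handled in callback_line_sensors combines a lost-line debounce with a start/stop-line check after a minimum number of steps. A plain-Python sketch of that decision follows; the two sensor thresholds are placeholder values for illustration, only the debounce count of 30 appears literally above.

LINE_SENSOR_ON_TRACK_MIN_VALUE = 200  # placeholder threshold for illustration
MIN_NUMBER_OF_STEPS = 100             # placeholder minimum step count
NO_LINE_DETECTION_LIMIT = 30          # matches the literal 30 in the callback

def is_sequence_done(sensor_data, steps, no_line_detection_count):
    # Start/stop line: both outermost sensors see the line at the same time.
    is_start_stop = (sensor_data[0] >= LINE_SENSOR_ON_TRACK_MIN_VALUE
                     and sensor_data[-1] >= LINE_SENSOR_ON_TRACK_MIN_VALUE)

    # Done when the line was lost long enough, or the start/stop line is
    # crossed after enough steps have been driven.
    return (no_line_detection_count >= NO_LINE_DETECTION_LIMIT
            or (is_start_stop and steps >= MIN_NUMBER_OF_STEPS))

print(is_sequence_done([250, 0, 0, 0, 250], steps=120, no_line_detection_count=0))  # True
print(is_sequence_done([0, 0, 0, 0, 0], steps=10, no_line_detection_count=30))      # True
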

From 437d25cf5fd805737e7174cb96602229311c4c4d Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 9 Sep 2024 17:19:35 +0200
Subject: [PATCH 25/37] remove unnecessary pylint disable comment from
 trajectory_buffer

---
 webots/controllers/RL_Supervisor/trajectory_buffer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py
index 151dcf52..f39824d8 100644
--- a/webots/controllers/RL_Supervisor/trajectory_buffer.py
+++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py
@@ -28,7 +28,7 @@
 # Imports
 ################################################################################
 
-import numpy as np  # pylint: disable=import-error
+import numpy as np
 
 ################################################################################
 # Variables
@@ -70,7 +70,7 @@ def generate_batches(self):
         Numpy-Array: Probs
         Numpy-Array: Vals
         Numpy-Array: Rewards
-        Numpy-Array: dones
+        Numpy-Array: Advantages
         List: Batches
         """
 

From 96c0b4956699b8c10dbf1c7ad94304bd197bac44 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 9 Sep 2024 17:34:07 +0200
Subject: [PATCH 26/37] add pylint disable for import-error due to Python
 3.9/3.10 library compatibility

---
 webots/controllers/RL_Supervisor/agent.py             |  6 +++---
 webots/controllers/RL_Supervisor/networks.py          | 10 +++++-----
 webots/controllers/RL_Supervisor/trajectory_buffer.py |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py
index 20ef40ac..b65dc0df 100644
--- a/webots/controllers/RL_Supervisor/agent.py
+++ b/webots/controllers/RL_Supervisor/agent.py
@@ -31,9 +31,9 @@
 import csv
 import os
 import struct
-import numpy as np
-import tensorflow as tf
-import tensorflow_probability as tfp
+import numpy as np                     # pylint: disable=import-error
+import tensorflow as tf                # pylint: disable=import-error
+import tensorflow_probability as tfp   # pylint: disable=import-error
 from   trajectory_buffer import Memory
 from   networks import Models
 
diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py
index 9deee19b..65d67b89 100644
--- a/webots/controllers/RL_Supervisor/networks.py
+++ b/webots/controllers/RL_Supervisor/networks.py
@@ -28,11 +28,11 @@
 # Imports
 ################################################################################
 
-import tensorflow as tf
-import tensorflow_probability as tfp
-from tensorflow import keras
-from keras import layers
-from keras.regularizers import l2
+import tensorflow as tf               # pylint: disable=import-error
+import tensorflow_probability as tfp  # pylint: disable=import-error
+from tensorflow import keras          # pylint: disable=import-error
+from keras import layers              # pylint: disable=import-error
+from keras.regularizers import l2     # pylint: disable=import-error
 
 ################################################################################
 # Variables
diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py
index f39824d8..3a244b28 100644
--- a/webots/controllers/RL_Supervisor/trajectory_buffer.py
+++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py
@@ -28,7 +28,7 @@
 # Imports
 ################################################################################
 
-import numpy as np
+import numpy as np      # pylint: disable=import-error
 
 ################################################################################
 # Variables

From f496711b427cad1f2f7492f00311682222996bd8 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Mon, 9 Sep 2024 23:06:25 +0200
Subject: [PATCH 27/37] corrected incorrect variable initialization

---
 webots/controllers/RL_Supervisor/networks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py
index 65d67b89..7895eb76 100644
--- a/webots/controllers/RL_Supervisor/networks.py
+++ b/webots/controllers/RL_Supervisor/networks.py
@@ -53,7 +53,7 @@ def __init__(self, actor_alpha, critic_alpha, std_dev, policy_clip):
         self.__actor_learning_rate = actor_alpha
         self.__critic_learning_rate = critic_alpha
         self.actor_network = self.build_actor_network()
-        self.critic_network = self.build_actor_network()
+        self.critic_network = self.build_critic_network()
         self.std_dev = std_dev
         self.policy_clip = policy_clip
         self.actor_optimizer = keras.optimizers.Adam(self.__actor_learning_rate)
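
The fix matters because the two builders produce different heads: the actor predicts an action mean while the critic predicts a scalar state value. A reduced sketch of that distinction (placeholder layer sizes and activations, not the project's exact architecture):

from tensorflow import keras
from keras import layers

NUM_SENSORS = 5

def tiny_actor():
    # Actor head: a bounded action mean.
    state_input = layers.Input(shape=(NUM_SENSORS,))
    hidden = layers.Dense(16, activation="relu")(state_input)
    action_mean = layers.Dense(1, activation="tanh")(hidden)
    return keras.models.Model(inputs=state_input, outputs=action_mean)

def tiny_critic():
    # Critic head: an unbounded scalar state value.
    state_input = layers.Input(shape=(NUM_SENSORS,))
    hidden = layers.Dense(16, activation="relu")(state_input)
    value = layers.Dense(1)(hidden)
    return keras.models.Model(inputs=state_input, outputs=value)

actor, critic = tiny_actor(), tiny_critic()
print(actor.output_shape, critic.output_shape)  # same shapes, different heads and activations
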

From 6537e80f85c86c025b4264148db28cfe0ef654e6 Mon Sep 17 00:00:00 2001
From: Gabryel Reyes <gabryel.reyes@newtec.de>
Date: Tue, 10 Sep 2024 09:54:53 +0200
Subject: [PATCH 28/37] Use a requirements file to cache pip

---
 .github/workflows/main.yml         |  8 ++++----
 .github/workflows/requirements.txt | 10 ++++++++++
 2 files changed, 14 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/requirements.txt

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index c0652cd0..a0fcdcba 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -209,11 +209,11 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
+          cache: 'pip'
+          cache-dependency-path: .github/workflows/requirements.txt
 
       - name: Install dependencies
-        run: |
-          pip install pylint "git+https://github.com/gabryelreyes/SerialMuxProt.git#egg=SerialMuxProt&subdirectory=python/SerialMuxProt"
+        run: pip install -r .github/workflows/requirements.txt
 
       - name: Analysing the code with pylint
-        run: |
-          pylint ./webots/controllers/*/*.py
+        run: pylint ./webots/controllers/*/*.py
diff --git a/.github/workflows/requirements.txt b/.github/workflows/requirements.txt
new file mode 100644
index 00000000..26d7047e
--- /dev/null
+++ b/.github/workflows/requirements.txt
@@ -0,0 +1,10 @@
+platformio==6.1.15
+numpy==1.26.4
+tensorflow==2.10.0
+tensorflow-probability==0.15.0
+keras==2.10.0
+matplotlib==3.9.0
+pandas==2.2.2
+pytest==8.3.2
+pylint==3.2.7
+git+https://github.com/gabryelreyes/SerialMuxProt.git#egg=SerialMuxProt&subdirectory=python/SerialMuxProt
\ No newline at end of file

From 51bb66a117999369b35dba6383de22ea5561d00f Mon Sep 17 00:00:00 2001
From: Gabryel Reyes <gabryel.reyes@newtec.de>
Date: Tue, 10 Sep 2024 10:01:19 +0200
Subject: [PATCH 29/37] Limit Python linting to 3.9 and 3.10

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index a0fcdcba..b258d3eb 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -198,7 +198,7 @@ jobs:
     needs: intro
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11"]
+        python-version: ["3.9", "3.10"]
 
     # Steps represent a sequence of tasks that will be executed as part of the job.
     steps:

From 6463269d19525fdab440c78c6b5733430d96870f Mon Sep 17 00:00:00 2001
From: Gabryel Reyes <gabryel.reyes@newtec.de>
Date: Tue, 10 Sep 2024 10:04:46 +0200
Subject: [PATCH 30/37] Remove pylint import-error disable comments

---
 webots/controllers/RL_Supervisor/agent.py             | 10 +++++-----
 webots/controllers/RL_Supervisor/networks.py          | 10 +++++-----
 webots/controllers/RL_Supervisor/plotting.py          |  4 ++--
 webots/controllers/RL_Supervisor/trajectory_buffer.py |  2 +-
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py
index b65dc0df..45769c84 100644
--- a/webots/controllers/RL_Supervisor/agent.py
+++ b/webots/controllers/RL_Supervisor/agent.py
@@ -31,11 +31,11 @@
 import csv
 import os
 import struct
-import numpy as np                     # pylint: disable=import-error
-import tensorflow as tf                # pylint: disable=import-error
-import tensorflow_probability as tfp   # pylint: disable=import-error
-from   trajectory_buffer import Memory
-from   networks import Models
+import numpy as np
+import tensorflow as tf
+import tensorflow_probability as tfp
+from trajectory_buffer import Memory
+from networks import Models
 
 ################################################################################
 # Variables
diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py
index 7895eb76..064995f0 100644
--- a/webots/controllers/RL_Supervisor/networks.py
+++ b/webots/controllers/RL_Supervisor/networks.py
@@ -28,11 +28,11 @@
 # Imports
 ################################################################################
 
-import tensorflow as tf               # pylint: disable=import-error
-import tensorflow_probability as tfp  # pylint: disable=import-error
-from tensorflow import keras          # pylint: disable=import-error
-from keras import layers              # pylint: disable=import-error
-from keras.regularizers import l2     # pylint: disable=import-error
+import tensorflow as tf
+import tensorflow_probability as tfp
+from tensorflow import keras
+from keras import layers
+from keras.regularizers import l2
 
 ################################################################################
 # Variables
diff --git a/webots/controllers/RL_Supervisor/plotting.py b/webots/controllers/RL_Supervisor/plotting.py
index 8efb92b3..2b62ba90 100644
--- a/webots/controllers/RL_Supervisor/plotting.py
+++ b/webots/controllers/RL_Supervisor/plotting.py
@@ -6,8 +6,8 @@
 # mini-batch steps.
 
 # Imports
-import matplotlib.pyplot as plt  # pylint: disable=import-error
-import pandas as pd  # pylint: disable=import-error
+import matplotlib.pyplot as plt
+import pandas as pd
 
 # Define the path to the CSV file
 LOG_FILE = "logs/training_logs.csv"
diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py
index 3a244b28..f39824d8 100644
--- a/webots/controllers/RL_Supervisor/trajectory_buffer.py
+++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py
@@ -28,7 +28,7 @@
 # Imports
 ################################################################################
 
-import numpy as np      # pylint: disable=import-error
+import numpy as np
 
 ################################################################################
 # Variables

From eb7643ca936875223642e3c7fec5c3b93ce325a1 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Tue, 10 Sep 2024 13:29:15 +0200
Subject: [PATCH 31/37] Refactored conditional logic by replacing it with
 if-else blocks for better readability

---
 lib/APPReinforcementLearning/src/App.cpp | 84 ++++++++++++------------
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp
index f7a9e725..4833b57b 100644
--- a/lib/APPReinforcementLearning/src/App.cpp
+++ b/lib/APPReinforcementLearning/src/App.cpp
@@ -96,9 +96,7 @@ void App::loop()
 {
     Board::getInstance().process();
     Speedometer::getInstance().process();
-    bool  isSensorDataSent = true;
-    bool  isModeDataSent   = true;
-    bool  isStatusDataSent = true;
+    bool  isDataSent = true;
 
     if (true == m_controlInterval.isTimeout())
     {
@@ -127,55 +125,57 @@ void App::loop()
             payload = {SMPChannelPayload::Status::DONE};
         }
 
-        isStatusDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload));
+        isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload));
 
         m_statusTimer.restart();
     }
 
-    /* Send periodically line sensor data. */
-    if (true == m_sendLineSensorsDataInterval.isTimeout() &&
-        (&DrivingState::getInstance() == m_systemStateMachine.getState()))
-    {
-        isSensorDataSent = sendLineSensorsData();
-
-        m_sendLineSensorsDataInterval.restart();
-    }
-
-    /* Send Mode selected to The Supervisor. */
-    if (&ReadyState::getInstance() == m_systemStateMachine.getState() && (false == m_modeSelectionSent))
-    {
-        uint8_t mode_options = ReadyState::getInstance().getSelectedMode();
-
-        if (0U < mode_options)
-        {
-            SMPChannelPayload::Mode payload =
-                (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE;
-
-            isModeDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload));
-
-            m_modeSelectionSent = true;
-        }
-    }
-
-    if (false == isSensorDataSent)
-    {
-        /* Failed to send Senssor data to the supervisor. Go to error state. */
-        ErrorState::getInstance().setErrorMsg("SEND_SF");
-        m_systemStateMachine.setState(&ErrorState::getInstance());
-    }
-
-    if (false == isStatusDataSent)
+    if (false == isDataSent)
     {
         /* Failed to send Status data to the supervisor. Go to error state. */
         ErrorState::getInstance().setErrorMsg("SD_SF");
         m_systemStateMachine.setState(&ErrorState::getInstance());
     }
-
-    if (false == isModeDataSent)
+    else
     {
-        /* Failed to send Mode data to the supervisor. Go to error state. */
-        ErrorState::getInstance().setErrorMsg("MD_SF");
-        m_systemStateMachine.setState(&ErrorState::getInstance());
+        /* Send periodically line sensor data. */
+        if (true == m_sendLineSensorsDataInterval.isTimeout() &&
+            (&DrivingState::getInstance() == m_systemStateMachine.getState()))
+        {
+            isDataSent = sendLineSensorsData();
+
+            m_sendLineSensorsDataInterval.restart();
+        }
+        if (false == isDataSent)
+        {
+            /* Failed to send Sensor data to the supervisor. Go to error state. */
+            ErrorState::getInstance().setErrorMsg("SEND_SF");
+            m_systemStateMachine.setState(&ErrorState::getInstance());
+        }
+        else
+        {
+            /* Send Mode selected to The Supervisor. */
+            if (&ReadyState::getInstance() == m_systemStateMachine.getState() && (false == m_modeSelectionSent))
+            {
+                uint8_t mode_options = ReadyState::getInstance().getSelectedMode();
+
+                if (0U < mode_options)
+                {
+                    SMPChannelPayload::Mode payload =
+                        (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE;
+
+                    isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload));
+
+                    m_modeSelectionSent = true;
+                }
+            }
+            if (false == isDataSent)
+            {
+                /* Failed to send Mode data to the supervisor. Go to error state. */
+                ErrorState::getInstance().setErrorMsg("MD_SF");
+                m_systemStateMachine.setState(&ErrorState::getInstance());
+            }
+        }
     }
 
     m_smpServer.process(millis());

From 3da1583edc3ddf99bbe2edca2dc59716cc363e0d Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Tue, 10 Sep 2024 13:40:15 +0200
Subject: [PATCH 32/37] Refactored conditional logic by replacing it with
 if-else blocks for better readability

---
 lib/APPReinforcementLearning/src/App.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp
index 4833b57b..06b4bfe9 100644
--- a/lib/APPReinforcementLearning/src/App.cpp
+++ b/lib/APPReinforcementLearning/src/App.cpp
@@ -132,7 +132,7 @@ void App::loop()
 
     if (false == isDataSent)
     {
-        /* Failed to send Status data to the supervisor. Go to error state. */
+        /* Failed to send Status data to the Supervisor. Go to error state. */
         ErrorState::getInstance().setErrorMsg("SD_SF");
         m_systemStateMachine.setState(&ErrorState::getInstance());
     }
@@ -148,7 +148,7 @@ void App::loop()
         }
         if (false == isDataSent)
         {
-            /* Failed to send Sensor data to the supervisor. Go to error state. */
+            /* Failed to send Sensor data to the Supervisor. Go to error state. */
             ErrorState::getInstance().setErrorMsg("SEND_SF");
             m_systemStateMachine.setState(&ErrorState::getInstance());
         }
@@ -171,7 +171,7 @@ void App::loop()
             }
             if (false == isDataSent)
             {
-                /* Failed to send Mode data to the supervisor. Go to error state. */
+                /* Failed to send Mode data to the Supervisor. Go to error state. */
                 ErrorState::getInstance().setErrorMsg("MD_SF");
                 m_systemStateMachine.setState(&ErrorState::getInstance());
             }

From 736d4e88580d8467559ac33622e645f7868ce587 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Tue, 10 Sep 2024 15:42:03 +0200
Subject: [PATCH 33/37] Add requirements.txt for Python

---
 webots/controllers/RL_Supervisor/requirements.txt | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 webots/controllers/RL_Supervisor/requirements.txt

diff --git a/webots/controllers/RL_Supervisor/requirements.txt b/webots/controllers/RL_Supervisor/requirements.txt
new file mode 100644
index 00000000..f6c65e91
--- /dev/null
+++ b/webots/controllers/RL_Supervisor/requirements.txt
@@ -0,0 +1,5 @@
+tensorflow
+tensorflow-probability
+tf-keras
+
+git+https://github.com/gabryelreyes/SerialMuxProt.git#egg=SerialMuxProt&subdirectory=python/SerialMuxProt
\ No newline at end of file

From 9a0a78b1e8cfafd0d32017f7f6e96bb0958fcc82 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Tue, 10 Sep 2024 16:36:18 +0200
Subject: [PATCH 34/37] add fixed package versions in requirements.txt

---
 webots/controllers/RL_Supervisor/requirements.txt | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/webots/controllers/RL_Supervisor/requirements.txt b/webots/controllers/RL_Supervisor/requirements.txt
index f6c65e91..478f3dd3 100644
--- a/webots/controllers/RL_Supervisor/requirements.txt
+++ b/webots/controllers/RL_Supervisor/requirements.txt
@@ -1,5 +1,4 @@
-tensorflow
-tensorflow-probability
-tf-keras
-
+numpy==1.26.4
+tensorflow==2.10.0
+tensorflow-probability==0.15.0
 git+https://github.com/gabryelreyes/SerialMuxProt.git#egg=SerialMuxProt&subdirectory=python/SerialMuxProt
\ No newline at end of file

From fc76987cb64f1501f59700445a4fd652cfed5ae1 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Tue, 10 Sep 2024 16:53:43 +0200
Subject: [PATCH 35/37] Add more dependencies to the Python requirements.txt

---
 webots/controllers/RL_Supervisor/requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/webots/controllers/RL_Supervisor/requirements.txt b/webots/controllers/RL_Supervisor/requirements.txt
index 478f3dd3..4b1bc18e 100644
--- a/webots/controllers/RL_Supervisor/requirements.txt
+++ b/webots/controllers/RL_Supervisor/requirements.txt
@@ -1,4 +1,6 @@
+matplotlib==3.9.0
 numpy==1.26.4
+pandas==2.2.2
 tensorflow==2.10.0
 tensorflow-probability==0.15.0
 git+https://github.com/gabryelreyes/SerialMuxProt.git#egg=SerialMuxProt&subdirectory=python/SerialMuxProt
\ No newline at end of file

From 277885c6e782b78dbc59ff6778955f5c95c3b025 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Tue, 10 Sep 2024 16:55:09 +0200
Subject: [PATCH 36/37] Add a comment in agent.py

---
 webots/controllers/RL_Supervisor/agent.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py
index 45769c84..82dafa93 100644
--- a/webots/controllers/RL_Supervisor/agent.py
+++ b/webots/controllers/RL_Supervisor/agent.py
@@ -108,7 +108,8 @@ def __init__(
         self.__chkpt_dir = chkpt_dir
         self.train_mode = False
         self.__top_speed = top_speed
-        self.__std_dev = 0.05
+        self.__std_dev = 0.05   # When training without an existing model, this should be
+                                # set to 0.9 manually.
         self.__memory = Memory(batch_size, max_buffer_length, gamma, gae_lambda)
         self.__neural_network = Models(actor_alpha, critic_alpha, self.__std_dev, policy_clip)
         self.__training_index = 0  # Track batch index during training
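
The 0.05 default (or 0.9 for a fresh training run, as the comment notes) is only a starting point: perform_training multiplies the standard deviation by STD_DEV_FACTOR after each training pass, and the MIN_STD_DEV constant suggests it is clamped at a floor. A minimal sketch of that decay, with the clamp assumed from the constant's name:

MIN_STD_DEV = 0.01      # minimum standard deviation, as defined in agent.py
STD_DEV_FACTOR = 0.995  # decay factor applied after each training pass

def anneal_std_dev(std_dev):
    # Decay the exploration noise, but never drop below the configured floor.
    return max(std_dev * STD_DEV_FACTOR, MIN_STD_DEV)

std_dev = 0.9  # suggested starting value when no pre-trained model exists
for episode in range(3):
    std_dev = anneal_std_dev(std_dev)
    print(f"episode {episode}: std_dev = {std_dev:.4f}")
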

From 4ae6987d8e0fe679d72524339ff6232207e13519 Mon Sep 17 00:00:00 2001
From: Akram <Akram.Bzeouich@Newtec.de>
Date: Tue, 10 Sep 2024 17:00:57 +0200
Subject: [PATCH 37/37] refactor: change condition to explicit boolean
 comparison

---
 lib/APPReinforcementLearning/src/App.cpp | 62 ++++++++++++------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp
index 06b4bfe9..005193e2 100644
--- a/lib/APPReinforcementLearning/src/App.cpp
+++ b/lib/APPReinforcementLearning/src/App.cpp
@@ -125,28 +125,25 @@ void App::loop()
             payload = {SMPChannelPayload::Status::DONE};
         }
 
-        isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload));
+        if (false == m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload)))
+        {
+            /* Failed to send Status data to the Supervisor. Go to error state. */
+            ErrorState::getInstance().setErrorMsg("SD_SF");
+            m_systemStateMachine.setState(&ErrorState::getInstance());
+        }
+        else
+        {
+            m_statusTimer.restart();
+        }
 
-        m_statusTimer.restart();
+
     }
 
-    if (false == isDataSent)
-    {
-        /* Failed to send Status data to the Supervisor. Go to error state. */
-        ErrorState::getInstance().setErrorMsg("SD_SF");
-        m_systemStateMachine.setState(&ErrorState::getInstance());
-    }
-    else
+    /* Send periodically line sensor data. */
+    if (true == m_sendLineSensorsDataInterval.isTimeout() &&
+        (&DrivingState::getInstance() == m_systemStateMachine.getState()))
     {
-        /* Send periodically line sensor data. */
-        if (true == m_sendLineSensorsDataInterval.isTimeout() &&
-            (&DrivingState::getInstance() == m_systemStateMachine.getState()))
-        {
-            isDataSent = sendLineSensorsData();
-
-            m_sendLineSensorsDataInterval.restart();
-        }
-        if (false == isDataSent)
+        if (false == sendLineSensorsData())
         {
             /* Failed to send Sensor data to the Supervisor. Go to error state. */
             ErrorState::getInstance().setErrorMsg("SEND_SF");
@@ -154,27 +151,30 @@ void App::loop()
         }
         else
         {
-            /* Send Mode selected to The Supervisor. */
-            if (&ReadyState::getInstance() == m_systemStateMachine.getState() && (false == m_modeSelectionSent))
-            {
-                uint8_t mode_options = ReadyState::getInstance().getSelectedMode();
+            m_sendLineSensorsDataInterval.restart();
+        }
+    }
 
-                if (0U < mode_options)
-                {
-                    SMPChannelPayload::Mode payload =
-                        (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE;
+    /* Send Mode selected to The Supervisor. */
+    if (&ReadyState::getInstance() == m_systemStateMachine.getState() && (false == m_modeSelectionSent))
+    {
+        uint8_t mode_options = ReadyState::getInstance().getSelectedMode();
 
-                    isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload));
+        if (0U < mode_options)
+        {
+            SMPChannelPayload::Mode payload =
+                (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE;
 
-                    m_modeSelectionSent = true;
-                }
-            }
-            if (false == isDataSent)
+            if (false == m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload)))
             {
                 /* Failed to send Mode data to the Supervisor. Go to error state. */
                 ErrorState::getInstance().setErrorMsg("MD_SF");
                 m_systemStateMachine.setState(&ErrorState::getInstance());
             }
+            else
+            {
+                m_modeSelectionSent = true;
+            }
         }
     }