Commit

adding kill model and delay to start training, seems to do better (wip)
Julio Jerez committed Dec 18, 2023
1 parent d883735 commit e95d3df
Showing 10 changed files with 103 additions and 63 deletions.
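
The gist of the change: the quadruped trainer now force-terminates an episode after a sustained run of low rewards (the "kill model"), and it skips optimization for the first frames after every reset (the delayed training start). The following standalone sketch distills those two mechanisms. TrainerSketch is a hypothetical stand-in written for this note, not a Newton SDK class, though the member names (m_killCounter, m_startTraining) and the thresholds (0.4, 256, 100) are taken from the diff below.

    #include <cstdio>

    // Minimal sketch (not Newton SDK code): a "kill counter" terminates an
    // episode after a sustained run of low rewards, and a "start training"
    // counter delays optimization for the first frames after each reset.
    class TrainerSketch
    {
        public:
        TrainerSketch()
            :m_killCounter(0)
            ,m_startTraining(0)
        {
        }

        // Any reward above the threshold resets the kill counter, so only an
        // unbroken run of poor rewards can terminate the episode (0.4f is the
        // value used in ndQuadrupedTest_1.cpp below).
        float GetReward(float rawReward)
        {
            if (rawReward > 0.4f)
            {
                m_killCounter = 0;
            }
            return IsTerminal() ? -1.0f : rawReward;
        }

        // The episode is killed after 256 consecutive low-reward steps.
        bool IsTerminal() const
        {
            return m_killCounter > 256;
        }

        // Optimization only runs once the model has settled for 100 frames,
        // so the transient right after a reset never reaches the optimizer.
        void OptimizeStep()
        {
            if (m_startTraining > 100)
            {
                // ... one optimizer step would run here ...
                m_killCounter++;
            }
            m_startTraining++;
        }

        // Both counters are cleared whenever the model is reset.
        void ResetModel()
        {
            m_killCounter = 0;
            m_startTraining = 0;
        }

        private:
        int m_killCounter;
        int m_startTraining;
    };

    int main()
    {
        TrainerSketch trainer;
        for (int i = 0; i < 400; i++)
        {
            trainer.OptimizeStep();
            float reward = trainer.GetReward(0.1f); // persistently poor reward
            if (trainer.IsTerminal())
            {
                printf("killed at step %d (reward %f)\n", i, reward);
                trainer.ResetModel();
                break;
            }
        }
        return 0;
    }

One detail the sketch glosses over: in the actual patch m_killCounter is declared mutable because GetReward() is a const member function; the sketch simply makes GetReward() non-const instead.
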
@@ -171,7 +171,7 @@ namespace ndCarpole_1
return m_model->IsTerminal();
}

void ResetModel() const
void ResetModel()
{
m_model->ResetModel();
}
@@ -316,7 +316,7 @@ namespace ndCarpole_1
m_pole->SetVelocity(poleVeloc);
}

void ResetModel() const
void ResetModel()
{
m_cart->SetMatrix(m_cartMatrix);
m_pole->SetMatrix(m_poleMatrix);
@@ -164,7 +164,7 @@ namespace ndCarpole_0
return m_model->IsTerminal();
}

void ResetModel() const
void ResetModel()
{
m_model->ResetModel();
}
@@ -317,7 +317,7 @@ namespace ndCarpole_0
m_pole->SetVelocity(poleVeloc);
}

void ResetModel() const
void ResetModel()
{
m_cart->SetMatrix(m_cartMatrix);
m_pole->SetMatrix(m_poleMatrix);
131 changes: 85 additions & 46 deletions newton-4.00/applications/ndSandbox/demos/ndQuadrupedTest_1.cpp
@@ -466,7 +466,7 @@ namespace ndQuadruped_1
return m_model->IsTerminal();
}

void ResetModel() const
void ResetModel()
{
m_model->m_control->Reset();
for (ndInt32 i = 0; i < m_basePose.GetCount(); i++)
@@ -1738,12 +1738,17 @@ namespace ndQuadruped_1
ndControllerAgent_trainer(const HyperParameters& hyperParameters)
:ndBrainAgentContinueVPG_Trainer<ND_AGENT_INPUTSIZE, m_actionsSize>(hyperParameters)
,m_bestActor(m_actor)
,m_basePose()
,m_bodies()
,m_outFile(nullptr)
,m_model(nullptr)
,m_timer(ndGetTimeInMicroseconds())
,m_maxGain(-1.0e10f)
,m_maxFrames(3000)
,m_killCounter(0)
,m_startTraining(0)
//,m_stopTraining(5000000)
,m_stopTraining(500000)
,m_timer(ndGetTimeInMicroseconds())
,m_modelIsTrained(false)
{
SetName(CONTROLLER_NAME);
@@ -1772,12 +1777,18 @@ namespace ndQuadruped_1

ndBrainFloat GetReward() const
{
ndBrainFloat reward = m_model->CalculateReward();
if (reward > ndBrainFloat(0.4f))
{
m_killCounter = 0;
}

if (IsTerminal())
{
return ndBrainFloat(-1.0f);
}

return m_model->CalculateReward();
return reward;
}

virtual void ApplyActions(ndBrainFloat* const actions) const
@@ -1812,78 +1823,106 @@ namespace ndQuadruped_1
////state = state && (m_startTraning >= 64);
//
//return m_model->IsTerminal();
if (m_killCounter > 256)
{
return true;
}
return false;
}

void ResetModel() const
void ResetModel()
{
m_killCounter = 0;
m_startTraining = 0;
m_model->m_control->Reset();
for (ndInt32 i = 0; i < m_basePose.GetCount(); i++)
{
m_basePose[i].SetPose();
}
}

void Step()
{
if (m_startTraining > 100)
{
ndBrainAgentContinueVPG_Trainer::Step();
}
else
{
ndBrainFixSizeVector<m_actionsSize> actions;
actions.Set(ndBrainFloat(0.0f));
m_model->ApplyActions(&actions[0]);
}
}

void OptimizeStep()
{
ndInt32 stopTraining = GetFramesCount();
if (stopTraining <= m_stopTraining)
if (m_startTraining > 100)
{
ndInt32 episodeCount = GetEposideCount();
ndBrainAgentContinueVPG_Trainer::OptimizeStep();

episodeCount -= GetEposideCount();
if (m_averageFramesPerEpisodes.GetAverage() >= ndFloat32(m_maxFrames))
ndInt32 stopTraining = GetFramesCount();
if (stopTraining <= m_stopTraining)
{
if (m_averageQvalue.GetAverage() > m_maxGain)
ndInt32 episodeCount = GetEposideCount();
ndBrainAgentContinueVPG_Trainer::OptimizeStep();

episodeCount -= GetEposideCount();
if (m_averageFramesPerEpisodes.GetAverage() >= ndFloat32(m_maxFrames))
{
m_bestActor.CopyFrom(m_actor);
m_maxGain = m_averageQvalue.GetAverage();
ndExpandTraceMessage("best actor episode: %d\taverageFrames: %f\taverageValue %f\n", GetEposideCount(), m_averageFramesPerEpisodes.GetAverage(), m_averageQvalue.GetAverage());
if (m_averageQvalue.GetAverage() > m_maxGain)
{
m_bestActor.CopyFrom(m_actor);
m_maxGain = m_averageQvalue.GetAverage();
ndExpandTraceMessage("best actor episode: %d\taverageFrames: %f\taverageValue %f\n", GetEposideCount(), m_averageFramesPerEpisodes.GetAverage(), m_averageQvalue.GetAverage());
}
}
}

if (episodeCount && !IsSampling())
{
ndExpandTraceMessage("step: %d\treward: %g\tframes: %g\n", GetFramesCount(), m_averageQvalue.GetAverage(), m_averageFramesPerEpisodes.GetAverage());
if (m_outFile)

if (episodeCount && !IsSampling())
{
fprintf(m_outFile, "%g\n", m_averageQvalue.GetAverage());
fflush(m_outFile);
ndExpandTraceMessage("step: %d\treward: %g\tframes: %g\n", GetFramesCount(), m_averageQvalue.GetAverage(), m_averageFramesPerEpisodes.GetAverage());
if (m_outFile)
{
fprintf(m_outFile, "%g\n", m_averageQvalue.GetAverage());
fflush(m_outFile);
}
}

if (stopTraining == m_stopTraining)
{
char fileName[1024];
m_modelIsTrained = true;
m_actor.CopyFrom(m_bestActor);
ndGetWorkingFileName(GetName().GetStr(), fileName);
SaveToFile(fileName);
ndExpandTraceMessage("saving to file: %s\n", fileName);
ndExpandTraceMessage("training complete\n");
ndUnsigned64 timer = ndGetTimeInMicroseconds() - m_timer;
ndExpandTraceMessage("training time: %g\n seconds", ndFloat32(ndFloat64(timer) * ndFloat32(1.0e-6f)));
}
}

if (stopTraining == m_stopTraining)
{
char fileName[1024];
m_modelIsTrained = true;
m_actor.CopyFrom(m_bestActor);
ndGetWorkingFileName(GetName().GetStr(), fileName);
SaveToFile(fileName);
ndExpandTraceMessage("saving to file: %s\n", fileName);
ndExpandTraceMessage("training complete\n");
ndUnsigned64 timer = ndGetTimeInMicroseconds() - m_timer;
ndExpandTraceMessage("training time: %g\n seconds", ndFloat32(ndFloat64(timer) * ndFloat32(1.0e-6f)));
}

//if (m_model->IsOutOfBounds())
//{
// m_model->TelePort();
//}

m_killCounter++;
}

//if (m_model->IsOutOfBounds())
//{
// m_model->TelePort();
//}
//m_startTraning++;

m_startTraining++;
}

FILE* m_outFile;
ndBrain m_bestActor;
ndFixSizeArray<ndBasePose, 32> m_basePose;
ndFixSizeArray<ndBodyDynamic*, 32> m_bodies;
FILE* m_outFile;
ndRobot* m_model;
ndUnsigned64 m_timer;
ndFloat32 m_maxGain;
ndInt32 m_maxFrames;
mutable ndInt32 m_killCounter;
ndInt32 m_startTraining;
ndInt32 m_stopTraining;
ndUnsigned64 m_timer;
bool m_modelIsTrained;
ndFixSizeArray<ndBasePose, 32> m_basePose;
ndFixSizeArray<ndBodyDynamic*, 32> m_bodies;
};

ndRobot(ndSharedPtr<ndBrainAgent>& agent)
4 changes: 2 additions & 2 deletions newton-4.00/applications/ndSandbox/demos/ndUnicycle.cpp
@@ -221,7 +221,7 @@ namespace ndUnicycle
return m_model->IsTerminal();
}

void ResetModel() const
void ResetModel()
{
m_model->ResetModel();
}
@@ -345,7 +345,7 @@ namespace ndUnicycle
return ndReal(reward);
}

void ResetModel() const
void ResetModel()
{
for (ndInt32 i = 0; i < m_basePose.GetCount(); i++)
{
2 changes: 1 addition & 1 deletion newton-4.00/sdk/dBrain/ndBrainAgent.h
@@ -44,7 +44,7 @@ class ndBrainAgent: public ndClassAlloc
virtual void InitWeights(ndBrainFloat weighVariance, ndBrainFloat biasVariance) = 0;

protected:
virtual void ResetModel() const = 0;
virtual void ResetModel() = 0;
virtual bool IsTerminal() const = 0;
virtual ndBrainFloat GetReward() const = 0;
virtual ndInt32 GetEpisodeFrames() const = 0;
4 changes: 2 additions & 2 deletions newton-4.00/sdk/dBrain/ndBrainAgentContinueVPG.h
@@ -38,9 +38,9 @@ class ndBrainAgentContinueVPG: public ndBrainAgent
void Step();

protected:
void ResetModel();
void OptimizeStep();
bool IsTrainer() const;
void ResetModel() const;
bool IsTerminal() const;
ndBrainFloat GetReward() const;
ndInt32 GetEpisodeFrames() const;
@@ -97,7 +97,7 @@ ndBrainFloat ndBrainAgentContinueVPG<statesDim, actionDim>::GetReward() const
}

template<ndInt32 statesDim, ndInt32 actionDim>
void ndBrainAgentContinueVPG<statesDim, actionDim>::ResetModel() const
void ndBrainAgentContinueVPG<statesDim, actionDim>::ResetModel()
{
ndAssert(0);
}
4 changes: 2 additions & 2 deletions newton-4.00/sdk/dBrain/ndBrainAgentDDPG.h
@@ -36,9 +36,9 @@ class ndBrainAgentDDPG: public ndBrainAgent
void Step();

protected:
void ResetModel();
void OptimizeStep();
bool IsTrainer() const;
void ResetModel() const;
bool IsTerminal() const;
ndBrainFloat GetReward() const;
ndInt32 GetEpisodeFrames() const;
@@ -78,7 +78,7 @@ ndBrainFloat ndBrainAgentDDPG<statesDim, actionDim>::GetReward() const
}

template<ndInt32 statesDim, ndInt32 actionDim>
void ndBrainAgentDDPG<statesDim, actionDim>::ResetModel() const
void ndBrainAgentDDPG<statesDim, actionDim>::ResetModel()
{
ndAssert(0);
}
4 changes: 2 additions & 2 deletions newton-4.00/sdk/dBrain/ndBrainAgentDQN.h
@@ -37,9 +37,9 @@ class ndBrainAgentDQN: public ndBrainAgent
void Step();

protected:
void ResetModel();
void OptimizeStep();
bool IsTrainer() const;
void ResetModel() const;
bool IsTerminal() const;
ndBrainFloat GetReward() const;
ndInt32 GetEpisodeFrames() const;
@@ -93,7 +93,7 @@ ndBrainFloat ndBrainAgentDQN<statesDim, actionDim>::GetReward() const
}

template<ndInt32 statesDim, ndInt32 actionDim>
void ndBrainAgentDQN<statesDim, actionDim>::ResetModel() const
void ndBrainAgentDQN<statesDim, actionDim>::ResetModel()
{
ndAssert(0);
}
5 changes: 3 additions & 2 deletions newton-4.00/sdk/dBrain/ndBrainAgentDiscreteVPG.h
@@ -38,9 +38,10 @@ class ndBrainAgentDiscreteVPG: public ndBrainAgent
void Step();

protected:
void ResetModel();
void OptimizeStep();
bool IsTrainer() const;
void ResetModel() const;

bool IsTerminal() const;
ndBrainFloat GetReward() const;
ndInt32 GetEpisodeFrames() const;
@@ -98,7 +99,7 @@ ndBrainFloat ndBrainAgentDiscreteVPG<statesDim, actionDim>::GetReward() const
}

template<ndInt32 statesDim, ndInt32 actionDim>
void ndBrainAgentDiscreteVPG<statesDim, actionDim>::ResetModel() const
void ndBrainAgentDiscreteVPG<statesDim, actionDim>::ResetModel()
{
ndAssert(0);
}
4 changes: 2 additions & 2 deletions newton-4.00/sdk/dBrain/ndBrainAgentTD3.h
@@ -36,9 +36,9 @@ class ndBrainAgentTD3: public ndBrainAgent
void Step();

protected:
void ResetModel();
void OptimizeStep();
bool IsTrainer() const;
void ResetModel() const;
bool IsTerminal() const;
ndBrainFloat GetReward() const;
ndInt32 GetEpisodeFrames() const;
@@ -78,7 +78,7 @@ ndBrainFloat ndBrainAgentTD3<statesDim, actionDim>::GetReward() const
}

template<ndInt32 statesDim, ndInt32 actionDim>
void ndBrainAgentTD3<statesDim, actionDim>::ResetModel() const
void ndBrainAgentTD3<statesDim, actionDim>::ResetModel()
{
ndAssert(0);
}