Align with gym mujoco options (#93)

1. Simplify the code and fix a bug in `max_episode_steps` handling. 2. Add the `terminate_when_unhealthy` and `exclude_current_positions_from_observation` options to most of the MuJoCo envs. 3. Add the test-only macro `ENVPOOL_TEST`. 4. Eliminate `info["qpos0"]` and `info["qvel0"]` when building the wheel.
sail-sg · Apr 29, 2022 · 8a269d8 · 8a269d8
1 parent 79d13c2
commit 8a269d8
Show file tree

Hide file tree

Showing 22 changed files with 388 additions and 264 deletions.
diff --git a/.bazelrc b/.bazelrc
@@ -2,7 +2,8 @@ build --action_env=BAZEL_LINKLIBS=-l%:libstdc++.a:-lm
 build --action_env=BAZEL_LINKOPTS=-static-libgcc
 build --incompatible_strict_action_env --cxxopt=-std=c++17 --host_cxxopt=-std=c++17 --client_env=BAZEL_CXXOPTS=-std=c++17
 build:release --copt=-g0 --copt=-O3 --copt=-DNDEBUG --copt=-msse --copt=-msse2 --copt=-mmmx
-build:debug --compilation_mode=dbg -s
+build:debug --cxxopt=-DENVPOOL_TEST --compilation_mode=dbg -s
+build:test --cxxopt=-DENVPOOL_TEST
 
 build:clang-tidy --aspects @bazel_clang_tidy//clang_tidy:clang_tidy.bzl%clang_tidy_aspect
 build:clang-tidy --@bazel_clang_tidy//:clang_tidy_config=//:clang_tidy_config

diff --git a/Makefile b/Makefile
@@ -97,7 +97,7 @@ bazel-build: bazel-install
 	cp bazel-bin/setup.runfiles/$(PROJECT_NAME)/dist/*.whl ./dist
 
 bazel-test: bazel-install
-	bazel test --test_output=all $(BAZELOPT) //... --config=release
+	bazel test --test_output=all $(BAZELOPT) //... --config=release --config=test
 
 bazel-clean: bazel-install
 	bazel clean --expunge
@@ -154,7 +154,7 @@ release-test1:
 	cd envpool && python3 make_test.py
 
 release-test2:
-	cd examples && python3 env_step.py
+	cd examples && python3 make_env.py && python3 env_step.py
 
 release-test: release-test1 release-test2
 
diff --git a/docs/conf.py b/docs/conf.py
@@ -29,7 +29,7 @@ def get_version() -> str:
 # -- Project information -----------------------------------------------------
 
 project = "EnvPool"
-copyright = "2021, Garena Online Private Limited"
+copyright = "2022, Garena Online Private Limited"
 author = "EnvPool Contributors"
 
 # The full version, including alpha/beta/rc tags

diff --git a/docs/pages/contributing.rst b/docs/pages/contributing.rst
@@ -66,6 +66,9 @@ integration; however, you don't want to build other stuff such as OpenCV:
 .. code-block:: bash
 
     bazel test --test_output=all //envpool/mujoco:mujoco_test --config=release
+    # or, alternatively, run the built test binary directly from its runfiles directory
+    cd bazel-bin/envpool/mujoco/mujoco_test.runfiles/envpool/
+    ./envpool/mujoco/mujoco_test
 
 Feel free to customize the command in ``Makefile``!
 

diff --git a/docs/pages/env.rst b/docs/pages/env.rst
@@ -389,6 +389,17 @@ Miscellaneous
     has already been defined as ``gen_`` (`link
     <https://github.com/sail-sg/envpool/blob/v0.4.0/envpool/core/env.h#L37>`_).
 
+.. note ::
+
+    ``ENVPOOL_TEST`` is a test-time macro. If you want a piece of C++ code to be
+    available only during unit tests, guard it with ``#ifdef``:
+
+    .. code-block:: c++
+
+        #ifdef ENVPOOL_TEST
+            fprintf(stderr, "here");
+        #endif
+
 
 Generate Dynamic Linked .so File and Instantiate in Python
 ----------------------------------------------------------

diff --git a/docs/pages/interface.rst b/docs/pages/interface.rst
@@ -1,5 +1,5 @@
-Interface
-=========
+Python Interface
+================
 
 envpool.make
 ------------

diff --git a/envpool/core/env.h b/envpool/core/env.h
@@ -38,7 +38,7 @@ class Env {
 
  private:
   StateBufferQueue* sbq_;
-  int order_, elapsed_step_;
+  int order_, current_step_;
   bool is_single_player_;
   StateBuffer::WritableSlice slice_;
   // for parsing single env action from input action batch
@@ -60,7 +60,7 @@ class Env {
         env_id_(env_id),
         seed_(spec.config["seed"_] + env_id),
         gen_(seed_),
-        elapsed_step_(-1),
+        current_step_(-1),
         is_single_player_(max_num_players_ == 1),
         action_specs_(spec.action_spec.template values<ShapeSpec>()),
         is_player_action_(Transform(action_specs_, [](const ShapeSpec& s) {
@@ -147,9 +147,9 @@ class Env {
     sbq_ = sbq;
     order_ = order;
     if (reset) {
-      elapsed_step_ = 0;
+      current_step_ = 0;
     } else {
-      ++elapsed_step_;
+      ++current_step_;
     }
   }
 
@@ -163,7 +163,7 @@ class Env {
     State state(&slice_.arr);
     state["done"_] = IsDone();
     state["info:env_id"_] = env_id_;
-    state["elapsed_step"_] = elapsed_step_;
+    state["elapsed_step"_] = current_step_;
     int* player_env_id(static_cast<int*>(state["info:players.env_id"_].data()));
     for (int i = 0; i < player_num; ++i) {
       player_env_id[i] = env_id_;

diff --git a/envpool/mujoco/ant.h b/envpool/mujoco/ant.h
@@ -34,6 +34,8 @@ class AntEnvFns {
     return MakeDict(
         "max_episode_steps"_.bind(1000), "reward_threshold"_.bind(6000.0),
         "frame_skip"_.bind(5), "post_constraint"_.bind(true),
+        "terminate_when_unhealthy"_.bind(true),
+        "exclude_current_positions_from_observation"_.bind(true),
         "forward_reward_weight"_.bind(1.0), "ctrl_cost_weight"_.bind(0.5),
         "contact_cost_weight"_.bind(5e-4), "healthy_reward"_.bind(1.0),
         "healthy_z_min"_.bind(0.2), "healthy_z_max"_.bind(1.0),
@@ -43,19 +45,22 @@ class AntEnvFns {
   template <typename Config>
   static decltype(auto) StateSpec(const Config& conf) {
     mjtNum inf = std::numeric_limits<mjtNum>::infinity();
-    return MakeDict("obs"_.bind(Spec<mjtNum>({111}, {-inf, inf})),
-                    "info:reward_forward"_.bind(Spec<mjtNum>({-1})),
-                    "info:reward_ctrl"_.bind(Spec<mjtNum>({-1})),
-                    "info:reward_contact"_.bind(Spec<mjtNum>({-1})),
-                    "info:reward_survive"_.bind(Spec<mjtNum>({-1})),
-                    "info:x_position"_.bind(Spec<mjtNum>({-1})),
-                    "info:y_position"_.bind(Spec<mjtNum>({-1})),
-                    "info:distance_from_origin"_.bind(Spec<mjtNum>({-1})),
-                    "info:x_velocity"_.bind(Spec<mjtNum>({-1})),
-                    "info:y_velocity"_.bind(Spec<mjtNum>({-1})),
-                    // TODO(jiayi): remove these two lines for speed
-                    "info:qpos0"_.bind(Spec<mjtNum>({15})),
-                    "info:qvel0"_.bind(Spec<mjtNum>({14})));
+    bool no_pos = conf["exclude_current_positions_from_observation"_];
+    return MakeDict(
+        "obs"_.bind(Spec<mjtNum>({no_pos ? 111 : 113}, {-inf, inf})),
+#ifdef ENVPOOL_TEST
+        "info:qpos0"_.bind(Spec<mjtNum>({15})),
+        "info:qvel0"_.bind(Spec<mjtNum>({14})),
+#endif
+        "info:reward_forward"_.bind(Spec<mjtNum>({-1})),
+        "info:reward_ctrl"_.bind(Spec<mjtNum>({-1})),
+        "info:reward_contact"_.bind(Spec<mjtNum>({-1})),
+        "info:reward_survive"_.bind(Spec<mjtNum>({-1})),
+        "info:x_position"_.bind(Spec<mjtNum>({-1})),
+        "info:y_position"_.bind(Spec<mjtNum>({-1})),
+        "info:distance_from_origin"_.bind(Spec<mjtNum>({-1})),
+        "info:x_velocity"_.bind(Spec<mjtNum>({-1})),
+        "info:y_velocity"_.bind(Spec<mjtNum>({-1})));
   }
   template <typename Config>
   static decltype(auto) ActionSpec(const Config& conf) {
@@ -67,23 +72,22 @@ typedef class EnvSpec<AntEnvFns> AntEnvSpec;
 
 class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
  protected:
-  int max_episode_steps_, elapsed_step_;
+  bool terminate_when_unhealthy_, no_pos_;
   mjtNum ctrl_cost_weight_, contact_cost_weight_;
   mjtNum forward_reward_weight_, healthy_reward_;
   mjtNum healthy_z_min_, healthy_z_max_;
   mjtNum contact_force_min_, contact_force_max_;
-  std::unique_ptr<mjtNum> qpos0_, qvel0_;  // for align check
   std::uniform_real_distribution<> dist_qpos_;
   std::normal_distribution<> dist_qvel_;
-  bool done_;
 
  public:
   AntEnv(const Spec& spec, int env_id)
       : Env<AntEnvSpec>(spec, env_id),
         MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/ant.xml",
-                  spec.config["frame_skip"_], spec.config["post_constraint"_]),
-        max_episode_steps_(spec.config["max_episode_steps"_]),
-        elapsed_step_(max_episode_steps_ + 1),
+                  spec.config["frame_skip"_], spec.config["post_constraint"_],
+                  spec.config["max_episode_steps"_]),
+        terminate_when_unhealthy_(spec.config["terminate_when_unhealthy"_]),
+        no_pos_(spec.config["exclude_current_positions_from_observation"_]),
         ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]),
         contact_cost_weight_(spec.config["contact_cost_weight"_]),
         forward_reward_weight_(spec.config["forward_reward_weight"_]),
@@ -92,19 +96,16 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
         healthy_z_max_(spec.config["healthy_z_max"_]),
         contact_force_min_(spec.config["contact_force_min"_]),
         contact_force_max_(spec.config["contact_force_max"_]),
-        qpos0_(new mjtNum[model_->nq]),
-        qvel0_(new mjtNum[model_->nv]),
         dist_qpos_(-spec.config["reset_noise_scale"_],
                    spec.config["reset_noise_scale"_]),
-        dist_qvel_(0, spec.config["reset_noise_scale"_]),
-        done_(true) {}
+        dist_qvel_(0, spec.config["reset_noise_scale"_]) {}
 
   void MujocoResetModel() {
     for (int i = 0; i < model_->nq; ++i) {
-      data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_qpos_(gen_);
+      data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_qpos_(gen_);
     }
     for (int i = 0; i < model_->nv; ++i) {
-      data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_qvel_(gen_);
+      data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_qvel_(gen_);
     }
   }
 
@@ -114,7 +115,7 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
     done_ = false;
     elapsed_step_ = 0;
     MujocoReset();
-    WriteObs(0.0f, 0, 0, 0, 0, 0, 0);
+    WriteObs(0.0f, 0, 0, 0, 0, 0, 0, 0);
   }
 
   void Step(const Action& action) override {
@@ -141,13 +142,16 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
       x = std::max(contact_force_min_, x);
       contact_cost += contact_cost_weight_ * x * x;
     }
-
     // reward and done
-    float reward = xv * forward_reward_weight_ + healthy_reward_ - ctrl_cost -
-                   contact_cost;
+    mjtNum healthy_reward =
+        terminate_when_unhealthy_ || IsHealthy() ? healthy_reward_ : 0.0;
+    float reward =
+        xv * forward_reward_weight_ + healthy_reward - ctrl_cost - contact_cost;
     ++elapsed_step_;
-    done_ = !IsHealthy() || (elapsed_step_ >= max_episode_steps_);
-    WriteObs(reward, xv, yv, ctrl_cost, contact_cost, x_after, y_after);
+    done_ = (terminate_when_unhealthy_ ? !IsHealthy() : false) ||
+            (elapsed_step_ >= max_episode_steps_);
+    WriteObs(reward, xv, yv, ctrl_cost, contact_cost, x_after, y_after,
+             healthy_reward);
   }
 
  private:
@@ -169,12 +173,13 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
   }
 
   void WriteObs(float reward, mjtNum xv, mjtNum yv, mjtNum ctrl_cost,
-                mjtNum contact_cost, mjtNum x_after, mjtNum y_after) {
+                mjtNum contact_cost, mjtNum x_after, mjtNum y_after,
+                mjtNum healthy_reward) {
     State state = Allocate();
     state["reward"_] = reward;
     // obs
     mjtNum* obs = static_cast<mjtNum*>(state["obs"_].data());
-    for (int i = 2; i < model_->nq; ++i) {
+    for (int i = no_pos_ ? 2 : 0; i < model_->nq; ++i) {
       *(obs++) = data_->qpos[i];
     }
     for (int i = 0; i < model_->nv; ++i) {
@@ -190,15 +195,17 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
     state["info:reward_forward"_] = xv * forward_reward_weight_;
     state["info:reward_ctrl"_] = -ctrl_cost;
     state["info:reward_contact"_] = -contact_cost;
-    state["info:reward_survive"_] = healthy_reward_;
+    state["info:reward_survive"_] = healthy_reward;
     state["info:x_position"_] = x_after;
     state["info:y_position"_] = y_after;
     state["info:distance_from_origin"_] =
         std::sqrt(x_after * x_after + y_after * y_after);
     state["info:x_velocity"_] = xv;
     state["info:y_velocity"_] = yv;
-    state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
-    state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
+#ifdef ENVPOOL_TEST
+    state["info:qpos0"_].Assign(qpos0_, model_->nq);
+    state["info:qvel0"_].Assign(qvel0_, model_->nv);
+#endif
   }
 };
 

diff --git a/envpool/mujoco/half_cheetah.h b/envpool/mujoco/half_cheetah.h
@@ -34,20 +34,23 @@ class HalfCheetahEnvFns {
     return MakeDict(
         "max_episode_steps"_.bind(1000), "reward_threshold"_.bind(4800.0),
         "frame_skip"_.bind(5), "post_constraint"_.bind(true),
+        "exclude_current_positions_from_observation"_.bind(true),
         "ctrl_cost_weight"_.bind(0.1), "forward_reward_weight"_.bind(1.0),
         "reset_noise_scale"_.bind(0.1));
   }
   template <typename Config>
   static decltype(auto) StateSpec(const Config& conf) {
     mjtNum inf = std::numeric_limits<mjtNum>::infinity();
-    return MakeDict("obs"_.bind(Spec<mjtNum>({17}, {-inf, inf})),
+    bool no_pos = conf["exclude_current_positions_from_observation"_];
+    return MakeDict("obs"_.bind(Spec<mjtNum>({no_pos ? 17 : 18}, {-inf, inf})),
+#ifdef ENVPOOL_TEST
+                    "info:qpos0"_.bind(Spec<mjtNum>({9})),
+                    "info:qvel0"_.bind(Spec<mjtNum>({9})),
+#endif
                     "info:reward_run"_.bind(Spec<mjtNum>({-1})),
                     "info:reward_ctrl"_.bind(Spec<mjtNum>({-1})),
                     "info:x_position"_.bind(Spec<mjtNum>({-1})),
-                    "info:x_velocity"_.bind(Spec<mjtNum>({-1})),
-                    // TODO(jiayi): remove these two lines for speed
-                    "info:qpos0"_.bind(Spec<mjtNum>({9})),
-                    "info:qvel0"_.bind(Spec<mjtNum>({9})));
+                    "info:x_velocity"_.bind(Spec<mjtNum>({-1})));
   }
   template <typename Config>
   static decltype(auto) ActionSpec(const Config& conf) {
@@ -59,35 +62,30 @@ typedef class EnvSpec<HalfCheetahEnvFns> HalfCheetahEnvSpec;
 
 class HalfCheetahEnv : public Env<HalfCheetahEnvSpec>, public MujocoEnv {
  protected:
-  int max_episode_steps_, elapsed_step_;
+  bool no_pos_;
   mjtNum ctrl_cost_weight_, forward_reward_weight_;
-  std::unique_ptr<mjtNum> qpos0_, qvel0_;  // for align check
   std::uniform_real_distribution<> dist_qpos_;
   std::normal_distribution<> dist_qvel_;
-  bool done_;
 
  public:
   HalfCheetahEnv(const Spec& spec, int env_id)
       : Env<HalfCheetahEnvSpec>(spec, env_id),
         MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/half_cheetah.xml",
-                  spec.config["frame_skip"_], spec.config["post_constraint"_]),
-        max_episode_steps_(spec.config["max_episode_steps"_]),
-        elapsed_step_(max_episode_steps_ + 1),
+                  spec.config["frame_skip"_], spec.config["post_constraint"_],
+                  spec.config["max_episode_steps"_]),
+        no_pos_(spec.config["exclude_current_positions_from_observation"_]),
         ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]),
         forward_reward_weight_(spec.config["forward_reward_weight"_]),
-        qpos0_(new mjtNum[model_->nq]),
-        qvel0_(new mjtNum[model_->nv]),
         dist_qpos_(-spec.config["reset_noise_scale"_],
                    spec.config["reset_noise_scale"_]),
-        dist_qvel_(0, spec.config["reset_noise_scale"_]),
-        done_(true) {}
+        dist_qvel_(0, spec.config["reset_noise_scale"_]) {}
 
   void MujocoResetModel() {
     for (int i = 0; i < model_->nq; ++i) {
-      data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_qpos_(gen_);
+      data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_qpos_(gen_);
     }
     for (int i = 0; i < model_->nv; ++i) {
-      data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_qvel_(gen_);
+      data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_qvel_(gen_);
     }
   }
 
@@ -127,7 +125,7 @@ class HalfCheetahEnv : public Env<HalfCheetahEnvSpec>, public MujocoEnv {
     state["reward"_] = reward;
     // obs
     mjtNum* obs = static_cast<mjtNum*>(state["obs"_].data());
-    for (int i = 1; i < model_->nq; ++i) {
+    for (int i = no_pos_ ? 1 : 0; i < model_->nq; ++i) {
       *(obs++) = data_->qpos[i];
     }
     for (int i = 0; i < model_->nv; ++i) {
@@ -138,8 +136,10 @@ class HalfCheetahEnv : public Env<HalfCheetahEnvSpec>, public MujocoEnv {
     state["info:reward_ctrl"_] = -ctrl_cost;
     state["info:x_position"_] = x_after;
     state["info:x_velocity"_] = xv;
-    state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
-    state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
+#ifdef ENVPOOL_TEST
+    state["info:qpos0"_].Assign(qpos0_, model_->nq);
+    state["info:qvel0"_].Assign(qvel0_, model_->nv);
+#endif
   }
 };