From 8a269d86f20722a6375ac566675a9fdeaf226fe5 Mon Sep 17 00:00:00 2001
From: Jiayi Weng <trinkle23897@gmail.com>
Date: Sat, 30 Apr 2022 06:02:39 +0800
Subject: [PATCH] Align with gym mujoco options (#93)

1. simplify code and fix a bug of max_episode_steps
2. add `terminate_when_unhealthy` and `exclude_current_positions_from_observation` for most of mujoco envs
3. add testing macro `ENVPOOL_TEST`
4. eliminate `info["qpos0"]` and `info["qvel0"]` when generating wheel
---
 .bazelrc                                  |  3 +-
 Makefile                                  |  4 +-
 docs/conf.py                              |  2 +-
 docs/pages/contributing.rst               |  3 +
 docs/pages/env.rst                        | 11 ++++
 docs/pages/interface.rst                  |  4 +-
 envpool/core/env.h                        | 10 +--
 envpool/mujoco/ant.h                      | 79 ++++++++++++-----------
 envpool/mujoco/half_cheetah.h             | 40 ++++++------
 envpool/mujoco/hopper.h                   | 49 +++++++-------
 envpool/mujoco/humanoid.h                 | 78 ++++++++++++----------
 envpool/mujoco/humanoid_standup.h         | 47 +++++++-------
 envpool/mujoco/inverted_double_pendulum.h | 28 ++++----
 envpool/mujoco/inverted_pendulum.h        | 28 ++++----
 envpool/mujoco/mujoco_env.h               | 15 ++++-
 envpool/mujoco/mujoco_test.py             | 78 ++++++++++++++++++++++
 envpool/mujoco/pusher.h                   | 40 ++++++------
 envpool/mujoco/reacher.h                  | 36 +++++------
 envpool/mujoco/registration.py            |  1 -
 envpool/mujoco/swimmer.h                  | 40 ++++++------
 envpool/mujoco/walker2d.h                 | 49 +++++++-------
 examples/make_env.py                      |  7 ++
 22 files changed, 388 insertions(+), 264 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index fcb2ce7a..f3fdebe0 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -2,7 +2,8 @@ build --action_env=BAZEL_LINKLIBS=-l%:libstdc++.a:-lm
 build --action_env=BAZEL_LINKOPTS=-static-libgcc
 build --incompatible_strict_action_env --cxxopt=-std=c++17 --host_cxxopt=-std=c++17 --client_env=BAZEL_CXXOPTS=-std=c++17
 build:release --copt=-g0 --copt=-O3 --copt=-DNDEBUG --copt=-msse --copt=-msse2 --copt=-mmmx
-build:debug --compilation_mode=dbg -s
+build:debug --cxxopt=-DENVPOOL_TEST --compilation_mode=dbg -s
+build:test --cxxopt=-DENVPOOL_TEST
 
 build:clang-tidy --aspects @bazel_clang_tidy//clang_tidy:clang_tidy.bzl%clang_tidy_aspect
 build:clang-tidy --@bazel_clang_tidy//:clang_tidy_config=//:clang_tidy_config
diff --git a/Makefile b/Makefile
index 6d2fadd5..05df27a3 100644
--- a/Makefile
+++ b/Makefile
@@ -97,7 +97,7 @@ bazel-build: bazel-install
 	cp bazel-bin/setup.runfiles/$(PROJECT_NAME)/dist/*.whl ./dist
 
 bazel-test: bazel-install
-	bazel test --test_output=all $(BAZELOPT) //... --config=release
+	bazel test --test_output=all $(BAZELOPT) //... --config=release --config=test
 
 bazel-clean: bazel-install
 	bazel clean --expunge
@@ -154,7 +154,7 @@ release-test1:
 	cd envpool && python3 make_test.py
 
 release-test2:
-	cd examples && python3 env_step.py
+	cd examples && python3 make_env.py && python3 env_step.py
 
 release-test: release-test1 release-test2
 
diff --git a/docs/conf.py b/docs/conf.py
index 63e1fa7f..9e68bd2d 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -29,7 +29,7 @@ def get_version() -> str:
 # -- Project information -----------------------------------------------------
 
 project = "EnvPool"
-copyright = "2021, Garena Online Private Limited"
+copyright = "2022, Garena Online Private Limited"
 author = "EnvPool Contributors"
 
 # The full version, including alpha/beta/rc tags
diff --git a/docs/pages/contributing.rst b/docs/pages/contributing.rst
index 6e753a4e..f5d03f79 100644
--- a/docs/pages/contributing.rst
+++ b/docs/pages/contributing.rst
@@ -66,6 +66,9 @@ integration; however, you don't want to build other stuff such as OpenCV:
 .. code-block:: bash
 
     bazel test --test_output=all //envpool/mujoco:mujoco_test --config=release
+    # or alternatively
+    cd bazel-bin/envpool/mujoco/mujoco_test.runfiles/envpool/
+    ./envpool/mujoco/mujoco_test
 
 Feel free to customize the command in ``Makefile``!
 
diff --git a/docs/pages/env.rst b/docs/pages/env.rst
index 8c2744f5..67e0dc59 100644
--- a/docs/pages/env.rst
+++ b/docs/pages/env.rst
@@ -389,6 +389,17 @@ Miscellaneous
     has already been defined as ``gen_`` (`link
     <https://github.com/sail-sg/envpool/blob/v0.4.0/envpool/core/env.h#L37>`_).
 
+.. note ::
+
+    ``ENVPOOL_TEST`` is a test-time macro. If you want a piece of C++ code only
+    available during unit test:
+
+    .. code-block:: c++
+
+        #ifdef ENVPOOL_TEST
+            fprintf(stderr, "here");
+        #endif
+
 
 Generate Dynamic Linked .so File and Instantiate in Python
 ----------------------------------------------------------
diff --git a/docs/pages/interface.rst b/docs/pages/interface.rst
index 98a4c611..e098a254 100644
--- a/docs/pages/interface.rst
+++ b/docs/pages/interface.rst
@@ -1,5 +1,5 @@
-Interface
-=========
+Python Interface
+================
 
 envpool.make
 ------------
diff --git a/envpool/core/env.h b/envpool/core/env.h
index 5e363632..8b3583a2 100644
--- a/envpool/core/env.h
+++ b/envpool/core/env.h
@@ -38,7 +38,7 @@ class Env {
 
  private:
   StateBufferQueue* sbq_;
-  int order_, elapsed_step_;
+  int order_, current_step_;
   bool is_single_player_;
   StateBuffer::WritableSlice slice_;
   // for parsing single env action from input action batch
@@ -60,7 +60,7 @@ class Env {
         env_id_(env_id),
         seed_(spec.config["seed"_] + env_id),
         gen_(seed_),
-        elapsed_step_(-1),
+        current_step_(-1),
         is_single_player_(max_num_players_ == 1),
         action_specs_(spec.action_spec.template values<ShapeSpec>()),
         is_player_action_(Transform(action_specs_, [](const ShapeSpec& s) {
@@ -147,9 +147,9 @@ class Env {
     sbq_ = sbq;
     order_ = order;
     if (reset) {
-      elapsed_step_ = 0;
+      current_step_ = 0;
     } else {
-      ++elapsed_step_;
+      ++current_step_;
     }
   }
 
@@ -163,7 +163,7 @@ class Env {
     State state(&slice_.arr);
     state["done"_] = IsDone();
     state["info:env_id"_] = env_id_;
-    state["elapsed_step"_] = elapsed_step_;
+    state["elapsed_step"_] = current_step_;
     int* player_env_id(static_cast<int*>(state["info:players.env_id"_].data()));
     for (int i = 0; i < player_num; ++i) {
       player_env_id[i] = env_id_;
diff --git a/envpool/mujoco/ant.h b/envpool/mujoco/ant.h
index b371ccba..225d16c1 100644
--- a/envpool/mujoco/ant.h
+++ b/envpool/mujoco/ant.h
@@ -34,6 +34,8 @@ class AntEnvFns {
     return MakeDict(
         "max_episode_steps"_.bind(1000), "reward_threshold"_.bind(6000.0),
         "frame_skip"_.bind(5), "post_constraint"_.bind(true),
+        "terminate_when_unhealthy"_.bind(true),
+        "exclude_current_positions_from_observation"_.bind(true),
         "forward_reward_weight"_.bind(1.0), "ctrl_cost_weight"_.bind(0.5),
         "contact_cost_weight"_.bind(5e-4), "healthy_reward"_.bind(1.0),
         "healthy_z_min"_.bind(0.2), "healthy_z_max"_.bind(1.0),
@@ -43,19 +45,22 @@ class AntEnvFns {
   template <typename Config>
   static decltype(auto) StateSpec(const Config& conf) {
     mjtNum inf = std::numeric_limits<mjtNum>::infinity();
-    return MakeDict("obs"_.bind(Spec<mjtNum>({111}, {-inf, inf})),
-                    "info:reward_forward"_.bind(Spec<mjtNum>({-1})),
-                    "info:reward_ctrl"_.bind(Spec<mjtNum>({-1})),
-                    "info:reward_contact"_.bind(Spec<mjtNum>({-1})),
-                    "info:reward_survive"_.bind(Spec<mjtNum>({-1})),
-                    "info:x_position"_.bind(Spec<mjtNum>({-1})),
-                    "info:y_position"_.bind(Spec<mjtNum>({-1})),
-                    "info:distance_from_origin"_.bind(Spec<mjtNum>({-1})),
-                    "info:x_velocity"_.bind(Spec<mjtNum>({-1})),
-                    "info:y_velocity"_.bind(Spec<mjtNum>({-1})),
-                    // TODO(jiayi): remove these two lines for speed
-                    "info:qpos0"_.bind(Spec<mjtNum>({15})),
-                    "info:qvel0"_.bind(Spec<mjtNum>({14})));
+    bool no_pos = conf["exclude_current_positions_from_observation"_];
+    return MakeDict(
+        "obs"_.bind(Spec<mjtNum>({no_pos ? 111 : 113}, {-inf, inf})),
+#ifdef ENVPOOL_TEST
+        "info:qpos0"_.bind(Spec<mjtNum>({15})),
+        "info:qvel0"_.bind(Spec<mjtNum>({14})),
+#endif
+        "info:reward_forward"_.bind(Spec<mjtNum>({-1})),
+        "info:reward_ctrl"_.bind(Spec<mjtNum>({-1})),
+        "info:reward_contact"_.bind(Spec<mjtNum>({-1})),
+        "info:reward_survive"_.bind(Spec<mjtNum>({-1})),
+        "info:x_position"_.bind(Spec<mjtNum>({-1})),
+        "info:y_position"_.bind(Spec<mjtNum>({-1})),
+        "info:distance_from_origin"_.bind(Spec<mjtNum>({-1})),
+        "info:x_velocity"_.bind(Spec<mjtNum>({-1})),
+        "info:y_velocity"_.bind(Spec<mjtNum>({-1})));
   }
   template <typename Config>
   static decltype(auto) ActionSpec(const Config& conf) {
@@ -67,23 +72,22 @@ typedef class EnvSpec<AntEnvFns> AntEnvSpec;
 
 class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
  protected:
-  int max_episode_steps_, elapsed_step_;
+  bool terminate_when_unhealthy_, no_pos_;
   mjtNum ctrl_cost_weight_, contact_cost_weight_;
   mjtNum forward_reward_weight_, healthy_reward_;
   mjtNum healthy_z_min_, healthy_z_max_;
   mjtNum contact_force_min_, contact_force_max_;
-  std::unique_ptr<mjtNum> qpos0_, qvel0_;  // for align check
   std::uniform_real_distribution<> dist_qpos_;
   std::normal_distribution<> dist_qvel_;
-  bool done_;
 
  public:
   AntEnv(const Spec& spec, int env_id)
       : Env<AntEnvSpec>(spec, env_id),
         MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/ant.xml",
-                  spec.config["frame_skip"_], spec.config["post_constraint"_]),
-        max_episode_steps_(spec.config["max_episode_steps"_]),
-        elapsed_step_(max_episode_steps_ + 1),
+                  spec.config["frame_skip"_], spec.config["post_constraint"_],
+                  spec.config["max_episode_steps"_]),
+        terminate_when_unhealthy_(spec.config["terminate_when_unhealthy"_]),
+        no_pos_(spec.config["exclude_current_positions_from_observation"_]),
         ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]),
         contact_cost_weight_(spec.config["contact_cost_weight"_]),
         forward_reward_weight_(spec.config["forward_reward_weight"_]),
@@ -92,19 +96,16 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
         healthy_z_max_(spec.config["healthy_z_max"_]),
         contact_force_min_(spec.config["contact_force_min"_]),
         contact_force_max_(spec.config["contact_force_max"_]),
-        qpos0_(new mjtNum[model_->nq]),
-        qvel0_(new mjtNum[model_->nv]),
         dist_qpos_(-spec.config["reset_noise_scale"_],
                    spec.config["reset_noise_scale"_]),
-        dist_qvel_(0, spec.config["reset_noise_scale"_]),
-        done_(true) {}
+        dist_qvel_(0, spec.config["reset_noise_scale"_]) {}
 
   void MujocoResetModel() {
     for (int i = 0; i < model_->nq; ++i) {
-      data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_qpos_(gen_);
+      data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_qpos_(gen_);
     }
     for (int i = 0; i < model_->nv; ++i) {
-      data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_qvel_(gen_);
+      data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_qvel_(gen_);
     }
   }
 
@@ -114,7 +115,7 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
     done_ = false;
     elapsed_step_ = 0;
     MujocoReset();
-    WriteObs(0.0f, 0, 0, 0, 0, 0, 0);
+    WriteObs(0.0f, 0, 0, 0, 0, 0, 0, 0);
   }
 
   void Step(const Action& action) override {
@@ -141,13 +142,16 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
       x = std::max(contact_force_min_, x);
       contact_cost += contact_cost_weight_ * x * x;
     }
-
     // reward and done
-    float reward = xv * forward_reward_weight_ + healthy_reward_ - ctrl_cost -
-                   contact_cost;
+    mjtNum healthy_reward =
+        terminate_when_unhealthy_ || IsHealthy() ? healthy_reward_ : 0.0;
+    float reward =
+        xv * forward_reward_weight_ + healthy_reward - ctrl_cost - contact_cost;
     ++elapsed_step_;
-    done_ = !IsHealthy() || (elapsed_step_ >= max_episode_steps_);
-    WriteObs(reward, xv, yv, ctrl_cost, contact_cost, x_after, y_after);
+    done_ = (terminate_when_unhealthy_ ? !IsHealthy() : false) ||
+            (elapsed_step_ >= max_episode_steps_);
+    WriteObs(reward, xv, yv, ctrl_cost, contact_cost, x_after, y_after,
+             healthy_reward);
   }
 
  private:
@@ -169,12 +173,13 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
   }
 
   void WriteObs(float reward, mjtNum xv, mjtNum yv, mjtNum ctrl_cost,
-                mjtNum contact_cost, mjtNum x_after, mjtNum y_after) {
+                mjtNum contact_cost, mjtNum x_after, mjtNum y_after,
+                mjtNum healthy_reward) {
     State state = Allocate();
     state["reward"_] = reward;
     // obs
     mjtNum* obs = static_cast<mjtNum*>(state["obs"_].data());
-    for (int i = 2; i < model_->nq; ++i) {
+    for (int i = no_pos_ ? 2 : 0; i < model_->nq; ++i) {
       *(obs++) = data_->qpos[i];
     }
     for (int i = 0; i < model_->nv; ++i) {
@@ -190,15 +195,17 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
     state["info:reward_forward"_] = xv * forward_reward_weight_;
     state["info:reward_ctrl"_] = -ctrl_cost;
     state["info:reward_contact"_] = -contact_cost;
-    state["info:reward_survive"_] = healthy_reward_;
+    state["info:reward_survive"_] = healthy_reward;
     state["info:x_position"_] = x_after;
     state["info:y_position"_] = y_after;
     state["info:distance_from_origin"_] =
         std::sqrt(x_after * x_after + y_after * y_after);
     state["info:x_velocity"_] = xv;
     state["info:y_velocity"_] = yv;
-    state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
-    state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
+#ifdef ENVPOOL_TEST
+    state["info:qpos0"_].Assign(qpos0_, model_->nq);
+    state["info:qvel0"_].Assign(qvel0_, model_->nv);
+#endif
   }
 };
 
diff --git a/envpool/mujoco/half_cheetah.h b/envpool/mujoco/half_cheetah.h
index e530b9f7..322a484a 100644
--- a/envpool/mujoco/half_cheetah.h
+++ b/envpool/mujoco/half_cheetah.h
@@ -34,20 +34,23 @@ class HalfCheetahEnvFns {
     return MakeDict(
         "max_episode_steps"_.bind(1000), "reward_threshold"_.bind(4800.0),
         "frame_skip"_.bind(5), "post_constraint"_.bind(true),
+        "exclude_current_positions_from_observation"_.bind(true),
         "ctrl_cost_weight"_.bind(0.1), "forward_reward_weight"_.bind(1.0),
         "reset_noise_scale"_.bind(0.1));
   }
   template <typename Config>
   static decltype(auto) StateSpec(const Config& conf) {
     mjtNum inf = std::numeric_limits<mjtNum>::infinity();
-    return MakeDict("obs"_.bind(Spec<mjtNum>({17}, {-inf, inf})),
+    bool no_pos = conf["exclude_current_positions_from_observation"_];
+    return MakeDict("obs"_.bind(Spec<mjtNum>({no_pos ? 17 : 18}, {-inf, inf})),
+#ifdef ENVPOOL_TEST
+                    "info:qpos0"_.bind(Spec<mjtNum>({9})),
+                    "info:qvel0"_.bind(Spec<mjtNum>({9})),
+#endif
                     "info:reward_run"_.bind(Spec<mjtNum>({-1})),
                     "info:reward_ctrl"_.bind(Spec<mjtNum>({-1})),
                     "info:x_position"_.bind(Spec<mjtNum>({-1})),
-                    "info:x_velocity"_.bind(Spec<mjtNum>({-1})),
-                    // TODO(jiayi): remove these two lines for speed
-                    "info:qpos0"_.bind(Spec<mjtNum>({9})),
-                    "info:qvel0"_.bind(Spec<mjtNum>({9})));
+                    "info:x_velocity"_.bind(Spec<mjtNum>({-1})));
   }
   template <typename Config>
   static decltype(auto) ActionSpec(const Config& conf) {
@@ -59,35 +62,30 @@ typedef class EnvSpec<HalfCheetahEnvFns> HalfCheetahEnvSpec;
 
 class HalfCheetahEnv : public Env<HalfCheetahEnvSpec>, public MujocoEnv {
  protected:
-  int max_episode_steps_, elapsed_step_;
+  bool no_pos_;
   mjtNum ctrl_cost_weight_, forward_reward_weight_;
-  std::unique_ptr<mjtNum> qpos0_, qvel0_;  // for align check
   std::uniform_real_distribution<> dist_qpos_;
   std::normal_distribution<> dist_qvel_;
-  bool done_;
 
  public:
   HalfCheetahEnv(const Spec& spec, int env_id)
       : Env<HalfCheetahEnvSpec>(spec, env_id),
         MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/half_cheetah.xml",
-                  spec.config["frame_skip"_], spec.config["post_constraint"_]),
-        max_episode_steps_(spec.config["max_episode_steps"_]),
-        elapsed_step_(max_episode_steps_ + 1),
+                  spec.config["frame_skip"_], spec.config["post_constraint"_],
+                  spec.config["max_episode_steps"_]),
+        no_pos_(spec.config["exclude_current_positions_from_observation"_]),
         ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]),
         forward_reward_weight_(spec.config["forward_reward_weight"_]),
-        qpos0_(new mjtNum[model_->nq]),
-        qvel0_(new mjtNum[model_->nv]),
         dist_qpos_(-spec.config["reset_noise_scale"_],
                    spec.config["reset_noise_scale"_]),
-        dist_qvel_(0, spec.config["reset_noise_scale"_]),
-        done_(true) {}
+        dist_qvel_(0, spec.config["reset_noise_scale"_]) {}
 
   void MujocoResetModel() {
     for (int i = 0; i < model_->nq; ++i) {
-      data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_qpos_(gen_);
+      data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_qpos_(gen_);
     }
     for (int i = 0; i < model_->nv; ++i) {
-      data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_qvel_(gen_);
+      data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_qvel_(gen_);
     }
   }
 
@@ -127,7 +125,7 @@ class HalfCheetahEnv : public Env<HalfCheetahEnvSpec>, public MujocoEnv {
     state["reward"_] = reward;
     // obs
     mjtNum* obs = static_cast<mjtNum*>(state["obs"_].data());
-    for (int i = 1; i < model_->nq; ++i) {
+    for (int i = no_pos_ ? 1 : 0; i < model_->nq; ++i) {
       *(obs++) = data_->qpos[i];
     }
     for (int i = 0; i < model_->nv; ++i) {
@@ -138,8 +136,10 @@ class HalfCheetahEnv : public Env<HalfCheetahEnvSpec>, public MujocoEnv {
     state["info:reward_ctrl"_] = -ctrl_cost;
     state["info:x_position"_] = x_after;
     state["info:x_velocity"_] = xv;
-    state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
-    state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
+#ifdef ENVPOOL_TEST
+    state["info:qpos0"_].Assign(qpos0_, model_->nq);
+    state["info:qvel0"_].Assign(qvel0_, model_->nv);
+#endif
   }
 };
 
diff --git a/envpool/mujoco/hopper.h b/envpool/mujoco/hopper.h
index 30b47a41..e6b72488 100644
--- a/envpool/mujoco/hopper.h
+++ b/envpool/mujoco/hopper.h
@@ -34,6 +34,8 @@ class HopperEnvFns {
     return MakeDict(
         "max_episode_steps"_.bind(1000), "reward_threshold"_.bind(6000.0),
         "frame_skip"_.bind(4), "post_constraint"_.bind(true),
+        "terminate_when_unhealthy"_.bind(true),
+        "exclude_current_positions_from_observation"_.bind(true),
         "ctrl_cost_weight"_.bind(1e-3), "forward_reward_weight"_.bind(1.0),
         "healthy_reward"_.bind(1.0), "velocity_min"_.bind(-10.0),
         "velocity_max"_.bind(10.0), "healthy_state_min"_.bind(-100.0),
@@ -44,12 +46,14 @@ class HopperEnvFns {
   template <typename Config>
   static decltype(auto) StateSpec(const Config& conf) {
     mjtNum inf = std::numeric_limits<mjtNum>::infinity();
-    return MakeDict("obs"_.bind(Spec<mjtNum>({11}, {-inf, inf})),
-                    "info:x_position"_.bind(Spec<mjtNum>({-1})),
-                    "info:x_velocity"_.bind(Spec<mjtNum>({-1})),
-                    // TODO(jiayi): remove these two lines for speed
+    bool no_pos = conf["exclude_current_positions_from_observation"_];
+    return MakeDict("obs"_.bind(Spec<mjtNum>({no_pos ? 11 : 12}, {-inf, inf})),
+#ifdef ENVPOOL_TEST
                     "info:qpos0"_.bind(Spec<mjtNum>({6})),
-                    "info:qvel0"_.bind(Spec<mjtNum>({6})));
+                    "info:qvel0"_.bind(Spec<mjtNum>({6})),
+#endif
+                    "info:x_position"_.bind(Spec<mjtNum>({-1})),
+                    "info:x_velocity"_.bind(Spec<mjtNum>({-1})));
   }
   template <typename Config>
   static decltype(auto) ActionSpec(const Config& conf) {
@@ -61,23 +65,22 @@ typedef class EnvSpec<HopperEnvFns> HopperEnvSpec;
 
 class HopperEnv : public Env<HopperEnvSpec>, public MujocoEnv {
  protected:
-  int max_episode_steps_, elapsed_step_;
+  bool terminate_when_unhealthy_, no_pos_;
   mjtNum ctrl_cost_weight_, forward_reward_weight_;
   mjtNum healthy_reward_, healthy_z_min_;
   mjtNum velocity_min_, velocity_max_;
   mjtNum healthy_state_min_, healthy_state_max_;
   mjtNum healthy_angle_min_, healthy_angle_max_;
-  std::unique_ptr<mjtNum> qpos0_, qvel0_;  // for align check
   std::uniform_real_distribution<> dist_;
-  bool done_;
 
  public:
   HopperEnv(const Spec& spec, int env_id)
       : Env<HopperEnvSpec>(spec, env_id),
         MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/hopper.xml",
-                  spec.config["frame_skip"_], spec.config["post_constraint"_]),
-        max_episode_steps_(spec.config["max_episode_steps"_]),
-        elapsed_step_(max_episode_steps_ + 1),
+                  spec.config["frame_skip"_], spec.config["post_constraint"_],
+                  spec.config["max_episode_steps"_]),
+        terminate_when_unhealthy_(spec.config["terminate_when_unhealthy"_]),
+        no_pos_(spec.config["exclude_current_positions_from_observation"_]),
         ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]),
         forward_reward_weight_(spec.config["forward_reward_weight"_]),
         healthy_reward_(spec.config["healthy_reward"_]),
@@ -88,18 +91,15 @@ class HopperEnv : public Env<HopperEnvSpec>, public MujocoEnv {
         healthy_state_max_(spec.config["healthy_state_max"_]),
         healthy_angle_min_(spec.config["healthy_angle_min"_]),
         healthy_angle_max_(spec.config["healthy_angle_max"_]),
-        qpos0_(new mjtNum[model_->nq]),
-        qvel0_(new mjtNum[model_->nv]),
         dist_(-spec.config["reset_noise_scale"_],
-              spec.config["reset_noise_scale"_]),
-        done_(true) {}
+              spec.config["reset_noise_scale"_]) {}
 
   void MujocoResetModel() {
     for (int i = 0; i < model_->nq; ++i) {
-      data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_(gen_);
+      data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_(gen_);
     }
     for (int i = 0; i < model_->nv; ++i) {
-      data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_(gen_);
+      data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_(gen_);
     }
   }
 
@@ -128,9 +128,12 @@ class HopperEnv : public Env<HopperEnvSpec>, public MujocoEnv {
     mjtNum dt = frame_skip_ * model_->opt.timestep;
     mjtNum xv = (x_after - x_before) / dt;
     // reward and done
-    float reward = xv * forward_reward_weight_ + healthy_reward_ - ctrl_cost;
+    mjtNum healthy_reward =
+        terminate_when_unhealthy_ || IsHealthy() ? healthy_reward_ : 0.0;
+    float reward = xv * forward_reward_weight_ + healthy_reward - ctrl_cost;
     ++elapsed_step_;
-    done_ = !IsHealthy() || (elapsed_step_ >= max_episode_steps_);
+    done_ = (terminate_when_unhealthy_ ? !IsHealthy() : false) ||
+            (elapsed_step_ >= max_episode_steps_);
     WriteObs(reward, xv, x_after);
   }
 
@@ -161,7 +164,7 @@ class HopperEnv : public Env<HopperEnvSpec>, public MujocoEnv {
     state["reward"_] = reward;
     // obs
     mjtNum* obs = static_cast<mjtNum*>(state["obs"_].data());
-    for (int i = 1; i < model_->nq; ++i) {
+    for (int i = no_pos_ ? 1 : 0; i < model_->nq; ++i) {
       *(obs++) = data_->qpos[i];
     }
     for (int i = 0; i < model_->nv; ++i) {
@@ -173,8 +176,10 @@ class HopperEnv : public Env<HopperEnvSpec>, public MujocoEnv {
     // info
     state["info:x_position"_] = x_after;
     state["info:x_velocity"_] = xv;
-    state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
-    state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
+#ifdef ENVPOOL_TEST
+    state["info:qpos0"_].Assign(qpos0_, model_->nq);
+    state["info:qvel0"_].Assign(qvel0_, model_->nv);
+#endif
   }
 };
 
diff --git a/envpool/mujoco/humanoid.h b/envpool/mujoco/humanoid.h
index 249e4496..9c8c3d96 100644
--- a/envpool/mujoco/humanoid.h
+++ b/envpool/mujoco/humanoid.h
@@ -34,6 +34,8 @@ class HumanoidEnvFns {
     return MakeDict(
         "max_episode_steps"_.bind(1000), "frame_skip"_.bind(5),
         "post_constraint"_.bind(true), "forward_reward_weight"_.bind(1.25),
+        "terminate_when_unhealthy"_.bind(true),
+        "exclude_current_positions_from_observation"_.bind(true),
         "ctrl_cost_weight"_.bind(0.1), "contact_cost_weight"_.bind(5e-7),
         "contact_cost_max"_.bind(10.0), "healthy_reward"_.bind(5.0),
         "healthy_z_min"_.bind(1.0), "healthy_z_max"_.bind(2.0),
@@ -42,19 +44,22 @@ class HumanoidEnvFns {
   template <typename Config>
   static decltype(auto) StateSpec(const Config& conf) {
     mjtNum inf = std::numeric_limits<mjtNum>::infinity();
-    return MakeDict("obs"_.bind(Spec<mjtNum>({376}, {-inf, inf})),
-                    "info:reward_linvel"_.bind(Spec<mjtNum>({-1})),
-                    "info:reward_quadctrl"_.bind(Spec<mjtNum>({-1})),
-                    "info:reward_alive"_.bind(Spec<mjtNum>({-1})),
-                    "info:reward_impact"_.bind(Spec<mjtNum>({-1})),
-                    "info:x_position"_.bind(Spec<mjtNum>({-1})),
-                    "info:y_position"_.bind(Spec<mjtNum>({-1})),
-                    "info:distance_from_origin"_.bind(Spec<mjtNum>({-1})),
-                    "info:x_velocity"_.bind(Spec<mjtNum>({-1})),
-                    "info:y_velocity"_.bind(Spec<mjtNum>({-1})),
-                    // TODO(jiayi): remove these two lines for speed
-                    "info:qpos0"_.bind(Spec<mjtNum>({24})),
-                    "info:qvel0"_.bind(Spec<mjtNum>({23})));
+    bool no_pos = conf["exclude_current_positions_from_observation"_];
+    return MakeDict(
+        "obs"_.bind(Spec<mjtNum>({no_pos ? 376 : 378}, {-inf, inf})),
+#ifdef ENVPOOL_TEST
+        "info:qpos0"_.bind(Spec<mjtNum>({24})),
+        "info:qvel0"_.bind(Spec<mjtNum>({23})),
+#endif
+        "info:reward_linvel"_.bind(Spec<mjtNum>({-1})),
+        "info:reward_quadctrl"_.bind(Spec<mjtNum>({-1})),
+        "info:reward_alive"_.bind(Spec<mjtNum>({-1})),
+        "info:reward_impact"_.bind(Spec<mjtNum>({-1})),
+        "info:x_position"_.bind(Spec<mjtNum>({-1})),
+        "info:y_position"_.bind(Spec<mjtNum>({-1})),
+        "info:distance_from_origin"_.bind(Spec<mjtNum>({-1})),
+        "info:x_velocity"_.bind(Spec<mjtNum>({-1})),
+        "info:y_velocity"_.bind(Spec<mjtNum>({-1})));
   }
   template <typename Config>
   static decltype(auto) ActionSpec(const Config& conf) {
@@ -66,22 +71,21 @@ typedef class EnvSpec<HumanoidEnvFns> HumanoidEnvSpec;
 
 class HumanoidEnv : public Env<HumanoidEnvSpec>, public MujocoEnv {
  protected:
-  int max_episode_steps_, elapsed_step_;
+  bool terminate_when_unhealthy_, no_pos_;
   mjtNum ctrl_cost_weight_, contact_cost_weight_, contact_cost_max_;
   mjtNum forward_reward_weight_, healthy_reward_;
   mjtNum healthy_z_min_, healthy_z_max_;
   mjtNum mass_x_, mass_y_;
-  std::unique_ptr<mjtNum> qpos0_, qvel0_;  // for align check
   std::uniform_real_distribution<> dist_;
-  bool done_;
 
  public:
   HumanoidEnv(const Spec& spec, int env_id)
       : Env<HumanoidEnvSpec>(spec, env_id),
         MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/humanoid.xml",
-                  spec.config["frame_skip"_], spec.config["post_constraint"_]),
-        max_episode_steps_(spec.config["max_episode_steps"_]),
-        elapsed_step_(max_episode_steps_ + 1),
+                  spec.config["frame_skip"_], spec.config["post_constraint"_],
+                  spec.config["max_episode_steps"_]),
+        terminate_when_unhealthy_(spec.config["terminate_when_unhealthy"_]),
+        no_pos_(spec.config["exclude_current_positions_from_observation"_]),
         ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]),
         contact_cost_weight_(spec.config["contact_cost_weight"_]),
         contact_cost_max_(spec.config["contact_cost_max"_]),
@@ -91,18 +95,15 @@ class HumanoidEnv : public Env<HumanoidEnvSpec>, public MujocoEnv {
         healthy_z_max_(spec.config["healthy_z_max"_]),
         mass_x_(0),
         mass_y_(0),
-        qpos0_(new mjtNum[model_->nq]),
-        qvel0_(new mjtNum[model_->nv]),
         dist_(-spec.config["reset_noise_scale"_],
-              spec.config["reset_noise_scale"_]),
-        done_(true) {}
+              spec.config["reset_noise_scale"_]) {}
 
   void MujocoResetModel() {
     for (int i = 0; i < model_->nq; ++i) {
-      data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_(gen_);
+      data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_(gen_);
     }
     for (int i = 0; i < model_->nv; ++i) {
-      data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_(gen_);
+      data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_(gen_);
     }
   }
 
@@ -112,7 +113,7 @@ class HumanoidEnv : public Env<HumanoidEnvSpec>, public MujocoEnv {
     done_ = false;
     elapsed_step_ = 0;
     MujocoReset();
-    WriteObs(0.0f, 0, 0, 0, 0, 0, 0);
+    WriteObs(0.0f, 0, 0, 0, 0, 0, 0, 0);
   }
 
   void Step(const Action& action) override {
@@ -142,11 +143,15 @@ class HumanoidEnv : public Env<HumanoidEnvSpec>, public MujocoEnv {
     contact_cost = std::min(contact_cost, contact_cost_max_);
 
     // reward and done
-    float reward = xv * forward_reward_weight_ + healthy_reward_ - ctrl_cost -
-                   contact_cost;
+    mjtNum healthy_reward =
+        terminate_when_unhealthy_ || IsHealthy() ? healthy_reward_ : 0.0;
+    float reward =
+        xv * forward_reward_weight_ + healthy_reward - ctrl_cost - contact_cost;
     ++elapsed_step_;
-    done_ = !IsHealthy() || (elapsed_step_ >= max_episode_steps_);
-    WriteObs(reward, xv, yv, ctrl_cost, contact_cost, x_after, y_after);
+    done_ = (terminate_when_unhealthy_ ? !IsHealthy() : false) ||
+            (elapsed_step_ >= max_episode_steps_);
+    WriteObs(reward, xv, yv, ctrl_cost, contact_cost, x_after, y_after,
+             healthy_reward);
   }
 
  private:
@@ -168,12 +173,13 @@ class HumanoidEnv : public Env<HumanoidEnvSpec>, public MujocoEnv {
   }
 
   void WriteObs(float reward, mjtNum xv, mjtNum yv, mjtNum ctrl_cost,
-                mjtNum contact_cost, mjtNum x_after, mjtNum y_after) {
+                mjtNum contact_cost, mjtNum x_after, mjtNum y_after,
+                mjtNum healthy_reward) {
     State state = Allocate();
     state["reward"_] = reward;
     // obs
     mjtNum* obs = static_cast<mjtNum*>(state["obs"_].data());
-    for (int i = 2; i < model_->nq; ++i) {
+    for (int i = no_pos_ ? 2 : 0; i < model_->nq; ++i) {
       *(obs++) = data_->qpos[i];
     }
     for (int i = 0; i < model_->nv; ++i) {
@@ -195,15 +201,17 @@ class HumanoidEnv : public Env<HumanoidEnvSpec>, public MujocoEnv {
     state["info:reward_linvel"_] = xv * forward_reward_weight_;
     state["info:reward_quadctrl"_] = -ctrl_cost;
     state["info:reward_impact"_] = -contact_cost;
-    state["info:reward_alive"_] = healthy_reward_;
+    state["info:reward_alive"_] = healthy_reward;
     state["info:x_position"_] = x_after;
     state["info:y_position"_] = y_after;
     state["info:distance_from_origin"_] =
         std::sqrt(x_after * x_after + y_after * y_after);
     state["info:x_velocity"_] = xv;
     state["info:y_velocity"_] = yv;
-    state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
-    state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
+#ifdef ENVPOOL_TEST
+    state["info:qpos0"_].Assign(qpos0_, model_->nq);
+    state["info:qvel0"_].Assign(qvel0_, model_->nv);
+#endif
   }
 };
 
diff --git a/envpool/mujoco/humanoid_standup.h b/envpool/mujoco/humanoid_standup.h
index e2a09945..d8dd314d 100644
--- a/envpool/mujoco/humanoid_standup.h
+++ b/envpool/mujoco/humanoid_standup.h
@@ -34,6 +34,7 @@ class HumanoidStandupEnvFns {
     return MakeDict(
         "max_episode_steps"_.bind(1000), "frame_skip"_.bind(5),
         "post_constraint"_.bind(true), "forward_reward_weight"_.bind(1.0),
+        "exclude_current_positions_from_observation"_.bind(true),
         "ctrl_cost_weight"_.bind(0.1), "contact_cost_weight"_.bind(5e-7),
         "contact_cost_max"_.bind(10.0), "healthy_reward"_.bind(1.0),
         "reset_noise_scale"_.bind(1e-2));
@@ -41,14 +42,17 @@ class HumanoidStandupEnvFns {
   template <typename Config>
   static decltype(auto) StateSpec(const Config& conf) {
     mjtNum inf = std::numeric_limits<mjtNum>::infinity();
-    return MakeDict("obs"_.bind(Spec<mjtNum>({376}, {-inf, inf})),
-                    "info:reward_linup"_.bind(Spec<mjtNum>({-1})),
-                    "info:reward_quadctrl"_.bind(Spec<mjtNum>({-1})),
-                    "info:reward_alive"_.bind(Spec<mjtNum>({-1})),
-                    "info:reward_impact"_.bind(Spec<mjtNum>({-1})),
-                    // TODO(jiayi): remove these two lines for speed
-                    "info:qpos0"_.bind(Spec<mjtNum>({24})),
-                    "info:qvel0"_.bind(Spec<mjtNum>({23})));
+    bool no_pos = conf["exclude_current_positions_from_observation"_];
+    return MakeDict(
+#ifdef ENVPOOL_TEST
+        "info:qpos0"_.bind(Spec<mjtNum>({24})),
+        "info:qvel0"_.bind(Spec<mjtNum>({23})),
+#endif
+        "obs"_.bind(Spec<mjtNum>({no_pos ? 376 : 378}, {-inf, inf})),
+        "info:reward_linup"_.bind(Spec<mjtNum>({-1})),
+        "info:reward_quadctrl"_.bind(Spec<mjtNum>({-1})),
+        "info:reward_alive"_.bind(Spec<mjtNum>({-1})),
+        "info:reward_impact"_.bind(Spec<mjtNum>({-1})));
   }
   template <typename Config>
   static decltype(auto) ActionSpec(const Config& conf) {
@@ -61,38 +65,33 @@ typedef class EnvSpec<HumanoidStandupEnvFns> HumanoidStandupEnvSpec;
 class HumanoidStandupEnv : public Env<HumanoidStandupEnvSpec>,
                            public MujocoEnv {
  protected:
-  int max_episode_steps_, elapsed_step_;
+  bool no_pos_;
   mjtNum ctrl_cost_weight_, contact_cost_weight_, contact_cost_max_;
   mjtNum forward_reward_weight_, healthy_reward_;
-  std::unique_ptr<mjtNum> qpos0_, qvel0_;  // for align check
   std::uniform_real_distribution<> dist_;
-  bool done_;
 
  public:
   HumanoidStandupEnv(const Spec& spec, int env_id)
       : Env<HumanoidStandupEnvSpec>(spec, env_id),
         MujocoEnv(
             spec.config["base_path"_] + "/mujoco/assets/humanoidstandup.xml",
-            spec.config["frame_skip"_], spec.config["post_constraint"_]),
-        max_episode_steps_(spec.config["max_episode_steps"_]),
-        elapsed_step_(max_episode_steps_ + 1),
+            spec.config["frame_skip"_], spec.config["post_constraint"_],
+            spec.config["max_episode_steps"_]),
+        no_pos_(spec.config["exclude_current_positions_from_observation"_]),
         ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]),
         contact_cost_weight_(spec.config["contact_cost_weight"_]),
         contact_cost_max_(spec.config["contact_cost_max"_]),
         forward_reward_weight_(spec.config["forward_reward_weight"_]),
         healthy_reward_(spec.config["healthy_reward"_]),
-        qpos0_(new mjtNum[model_->nq]),
-        qvel0_(new mjtNum[model_->nv]),
         dist_(-spec.config["reset_noise_scale"_],
-              spec.config["reset_noise_scale"_]),
-        done_(true) {}
+              spec.config["reset_noise_scale"_]) {}
 
   void MujocoResetModel() {
     for (int i = 0; i < model_->nq; ++i) {
-      data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_(gen_);
+      data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_(gen_);
     }
     for (int i = 0; i < model_->nv; ++i) {
-      data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_(gen_);
+      data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_(gen_);
     }
   }
 
@@ -139,7 +138,7 @@ class HumanoidStandupEnv : public Env<HumanoidStandupEnvSpec>,
     state["reward"_] = reward;
     // obs
     mjtNum* obs = static_cast<mjtNum*>(state["obs"_].data());
-    for (int i = 2; i < model_->nq; ++i) {
+    for (int i = no_pos_ ? 2 : 0; i < model_->nq; ++i) {
       *(obs++) = data_->qpos[i];
     }
     for (int i = 0; i < model_->nv; ++i) {
@@ -162,8 +161,10 @@ class HumanoidStandupEnv : public Env<HumanoidStandupEnvSpec>,
     state["info:reward_quadctrl"_] = -ctrl_cost;
     state["info:reward_impact"_] = -contact_cost;
     state["info:reward_alive"_] = healthy_reward_;
-    state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
-    state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
+#ifdef ENVPOOL_TEST
+    state["info:qpos0"_].Assign(qpos0_, model_->nq);
+    state["info:qvel0"_].Assign(qvel0_, model_->nv);
+#endif
   }
 };
 
diff --git a/envpool/mujoco/inverted_double_pendulum.h b/envpool/mujoco/inverted_double_pendulum.h
index c57c2195..eba930b9 100644
--- a/envpool/mujoco/inverted_double_pendulum.h
+++ b/envpool/mujoco/inverted_double_pendulum.h
@@ -41,10 +41,13 @@ class InvertedDoublePendulumEnvFns {
   template <typename Config>
   static decltype(auto) StateSpec(const Config& conf) {
     mjtNum inf = std::numeric_limits<mjtNum>::infinity();
+#ifdef ENVPOOL_TEST
     return MakeDict("obs"_.bind(Spec<mjtNum>({11}, {-inf, inf})),
-                    // TODO(jiayi): remove these two lines for speed
                     "info:qpos0"_.bind(Spec<mjtNum>({3})),
                     "info:qvel0"_.bind(Spec<mjtNum>({3})));
+#else
+    return MakeDict("obs"_.bind(Spec<mjtNum>({11}, {-inf, inf})));
+#endif
   }
   template <typename Config>
   static decltype(auto) ActionSpec(const Config& conf) {
@@ -58,39 +61,32 @@ typedef class EnvSpec<InvertedDoublePendulumEnvFns>
 class InvertedDoublePendulumEnv : public Env<InvertedDoublePendulumEnvSpec>,
                                   public MujocoEnv {
  protected:
-  int max_episode_steps_, elapsed_step_;
   mjtNum healthy_reward_, healthy_z_max_;
   mjtNum observation_min_, observation_max_;
-  std::unique_ptr<mjtNum> qpos0_, qvel0_;  // for align check
   std::uniform_real_distribution<> dist_qpos_;
   std::normal_distribution<> dist_qvel_;
-  bool done_;
 
  public:
   InvertedDoublePendulumEnv(const Spec& spec, int env_id)
       : Env<InvertedDoublePendulumEnvSpec>(spec, env_id),
         MujocoEnv(spec.config["base_path"_] +
                       "/mujoco/assets/inverted_double_pendulum.xml",
-                  spec.config["frame_skip"_], spec.config["post_constraint"_]),
-        max_episode_steps_(spec.config["max_episode_steps"_]),
-        elapsed_step_(max_episode_steps_ + 1),
+                  spec.config["frame_skip"_], spec.config["post_constraint"_],
+                  spec.config["max_episode_steps"_]),
         healthy_reward_(spec.config["healthy_reward"_]),
         healthy_z_max_(spec.config["healthy_z_max"_]),
         observation_min_(spec.config["observation_min"_]),
         observation_max_(spec.config["observation_max"_]),
-        qpos0_(new mjtNum[model_->nq]),
-        qvel0_(new mjtNum[model_->nv]),
         dist_qpos_(-spec.config["reset_noise_scale"_],
                    spec.config["reset_noise_scale"_]),
-        dist_qvel_(0, spec.config["reset_noise_scale"_]),
-        done_(true) {}
+        dist_qvel_(0, spec.config["reset_noise_scale"_]) {}
 
   void MujocoResetModel() {
     for (int i = 0; i < model_->nq; ++i) {
-      data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_qpos_(gen_);
+      data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_qpos_(gen_);
     }
     for (int i = 0; i < model_->nv; ++i) {
-      data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_qvel_(gen_);
+      data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_qvel_(gen_);
     }
   }
 
@@ -148,8 +144,10 @@ class InvertedDoublePendulumEnv : public Env<InvertedDoublePendulumEnvSpec>,
       *(obs++) = x;
     }
     // info
-    state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
-    state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
+#ifdef ENVPOOL_TEST
+    state["info:qpos0"_].Assign(qpos0_, model_->nq);
+    state["info:qvel0"_].Assign(qvel0_, model_->nv);
+#endif
   }
 };
 
diff --git a/envpool/mujoco/inverted_pendulum.h b/envpool/mujoco/inverted_pendulum.h
index aba0896b..cfad7a46 100644
--- a/envpool/mujoco/inverted_pendulum.h
+++ b/envpool/mujoco/inverted_pendulum.h
@@ -40,10 +40,13 @@ class InvertedPendulumEnvFns {
   template <typename Config>
   static decltype(auto) StateSpec(const Config& conf) {
     mjtNum inf = std::numeric_limits<mjtNum>::infinity();
+#ifdef ENVPOOL_TEST
     return MakeDict("obs"_.bind(Spec<mjtNum>({4}, {-inf, inf})),
-                    // TODO(jiayi): remove these two lines for speed
                     "info:qpos0"_.bind(Spec<mjtNum>({2})),
                     "info:qvel0"_.bind(Spec<mjtNum>({2})));
+#else
+    return MakeDict("obs"_.bind(Spec<mjtNum>({4}, {-inf, inf})));
+#endif
   }
   template <typename Config>
   static decltype(auto) ActionSpec(const Config& conf) {
@@ -56,35 +59,28 @@ typedef class EnvSpec<InvertedPendulumEnvFns> InvertedPendulumEnvSpec;
 class InvertedPendulumEnv : public Env<InvertedPendulumEnvSpec>,
                             public MujocoEnv {
  protected:
-  int max_episode_steps_, elapsed_step_;
   mjtNum healthy_reward_, healthy_z_min_, healthy_z_max_;
-  std::unique_ptr<mjtNum> qpos0_, qvel0_;  // for align check
   std::uniform_real_distribution<> dist_;
-  bool done_;
 
  public:
   InvertedPendulumEnv(const Spec& spec, int env_id)
       : Env<InvertedPendulumEnvSpec>(spec, env_id),
         MujocoEnv(
             spec.config["base_path"_] + "/mujoco/assets/inverted_pendulum.xml",
-            spec.config["frame_skip"_], spec.config["post_constraint"_]),
-        max_episode_steps_(spec.config["max_episode_steps"_]),
-        elapsed_step_(max_episode_steps_ + 1),
+            spec.config["frame_skip"_], spec.config["post_constraint"_],
+            spec.config["max_episode_steps"_]),
         healthy_reward_(spec.config["healthy_reward"_]),
         healthy_z_min_(spec.config["healthy_z_min"_]),
         healthy_z_max_(spec.config["healthy_z_max"_]),
-        qpos0_(new mjtNum[model_->nq]),
-        qvel0_(new mjtNum[model_->nv]),
         dist_(-spec.config["reset_noise_scale"_],
-              spec.config["reset_noise_scale"_]),
-        done_(true) {}
+              spec.config["reset_noise_scale"_]) {}
 
   void MujocoResetModel() {
     for (int i = 0; i < model_->nq; ++i) {
-      data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_(gen_);
+      data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_(gen_);
     }
     for (int i = 0; i < model_->nv; ++i) {
-      data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_(gen_);
+      data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_(gen_);
     }
   }
 
@@ -137,8 +133,10 @@ class InvertedPendulumEnv : public Env<InvertedPendulumEnvSpec>,
       *(obs++) = data_->qvel[i];
     }
     // info
-    state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
-    state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
+#ifdef ENVPOOL_TEST
+    state["info:qpos0"_].Assign(qpos0_, model_->nq);
+    state["info:qvel0"_].Assign(qvel0_, model_->nv);
+#endif
   }
 };
 
diff --git a/envpool/mujoco/mujoco_env.h b/envpool/mujoco/mujoco_env.h
index 71991c97..5e5c3a17 100644
--- a/envpool/mujoco/mujoco_env.h
+++ b/envpool/mujoco/mujoco_env.h
@@ -30,17 +30,26 @@ class MujocoEnv {
   mjModel* model_;
   mjData* data_;
   mjtNum *init_qpos_, *init_qvel_;
+  mjtNum *qpos0_, *qvel0_;  // for align check
   int frame_skip_;
   bool post_constraint_;
+  int max_episode_steps_, elapsed_step_;
+  bool done_;
 
  public:
-  MujocoEnv(std::string xml, int frame_skip, bool post_constraint)
+  MujocoEnv(std::string xml, int frame_skip, bool post_constraint,
+            int max_episode_steps)
       : model_(mj_loadXML(xml.c_str(), 0, error_, 1000)),
         data_(mj_makeData(model_)),
         init_qpos_(new mjtNum[model_->nq]),
         init_qvel_(new mjtNum[model_->nv]),
+        qpos0_(new mjtNum[model_->nq]),
+        qvel0_(new mjtNum[model_->nv]),
         frame_skip_(frame_skip),
-        post_constraint_(post_constraint) {
+        post_constraint_(post_constraint),
+        max_episode_steps_(max_episode_steps),
+        elapsed_step_(max_episode_steps + 1),
+        done_(true) {
     memcpy(init_qpos_, data_->qpos, sizeof(mjtNum) * model_->nq);
     memcpy(init_qvel_, data_->qvel, sizeof(mjtNum) * model_->nv);
   }
@@ -50,6 +59,8 @@ class MujocoEnv {
     mj_deleteModel(model_);
     delete[] init_qpos_;
     delete[] init_qvel_;
+    delete[] qpos0_;
+    delete[] qvel0_;
   }
 
   void MujocoReset() {
diff --git a/envpool/mujoco/mujoco_test.py b/envpool/mujoco/mujoco_test.py
index a59586b6..6f3a1851 100644
--- a/envpool/mujoco/mujoco_test.py
+++ b/envpool/mujoco/mujoco_test.py
@@ -130,6 +130,21 @@ def test_ant(self) -> None:
     env1 = AntGymEnvPool(AntEnvSpec(AntEnvSpec.gen_config()))
     self.run_space_check(env0, env1)
     self.run_align_check(env0, env1)
+    env0 = mjc_mwe.AntEnv(
+      terminate_when_unhealthy=False,
+      exclude_current_positions_from_observation=False,
+    )
+    env1 = AntGymEnvPool(
+      AntEnvSpec(
+        AntEnvSpec.gen_config(
+          terminate_when_unhealthy=False,
+          exclude_current_positions_from_observation=False,
+          max_episode_steps=100,
+        )
+      )
+    )
+    self.run_space_check(env0, env1)
+    self.run_align_check(env0, env1, no_time_limit=True)
     self.run_deterministic_check(AntEnvSpec, AntGymEnvPool)
 
   def test_half_cheetah(self) -> None:
@@ -139,6 +154,17 @@ def test_half_cheetah(self) -> None:
     )
     self.run_space_check(env0, env1)
     self.run_align_check(env0, env1, no_time_limit=True)
+    env0 = mjc_mwe.HalfCheetahEnv(
+      exclude_current_positions_from_observation=True
+    )
+    env1 = HalfCheetahGymEnvPool(
+      HalfCheetahEnvSpec(
+        HalfCheetahEnvSpec.gen_config(
+          exclude_current_positions_from_observation=True
+        )
+      )
+    )
+    self.run_space_check(env0, env1)
     self.run_deterministic_check(HalfCheetahEnvSpec, HalfCheetahGymEnvPool)
 
   def test_hopper(self) -> None:
@@ -146,6 +172,20 @@ def test_hopper(self) -> None:
     env1 = HopperGymEnvPool(HopperEnvSpec(HopperEnvSpec.gen_config()))
     self.run_space_check(env0, env1)
     self.run_align_check(env0, env1)
+    env0 = mjc_mwe.HopperEnv(
+      terminate_when_unhealthy=False,
+      exclude_current_positions_from_observation=False,
+    )
+    env1 = HopperGymEnvPool(
+      HopperEnvSpec(
+        HopperEnvSpec.gen_config(
+          terminate_when_unhealthy=False,
+          exclude_current_positions_from_observation=False,
+        )
+      )
+    )
+    self.run_space_check(env0, env1)
+    self.run_align_check(env0, env1, no_time_limit=True)
     self.run_deterministic_check(HopperEnvSpec, HopperGymEnvPool)
 
   def test_humanoid(self) -> None:
@@ -153,6 +193,20 @@ def test_humanoid(self) -> None:
     env1 = HumanoidGymEnvPool(HumanoidEnvSpec(HumanoidEnvSpec.gen_config()))
     self.run_space_check(env0, env1)
     self.run_align_check(env0, env1)
+    env0 = mjc_mwe.HumanoidEnv(
+      terminate_when_unhealthy=False,
+      exclude_current_positions_from_observation=False,
+    )
+    env1 = HumanoidGymEnvPool(
+      HumanoidEnvSpec(
+        HumanoidEnvSpec.gen_config(
+          terminate_when_unhealthy=False,
+          exclude_current_positions_from_observation=False,
+        )
+      )
+    )
+    self.run_space_check(env0, env1)
+    self.run_align_check(env0, env1, no_time_limit=True)
     self.run_deterministic_check(HumanoidEnvSpec, HumanoidGymEnvPool)
 
   def test_humanoid_standup(self) -> None:
@@ -209,12 +263,36 @@ def test_swimmer(self) -> None:
     env1 = SwimmerGymEnvPool(SwimmerEnvSpec(SwimmerEnvSpec.gen_config()))
     self.run_space_check(env0, env1)
     self.run_align_check(env0, env1, no_time_limit=True)
+    env0 = mjc_mwe.SwimmerEnv(exclude_current_positions_from_observation=False)
+    env1 = SwimmerGymEnvPool(
+      SwimmerEnvSpec(
+        SwimmerEnvSpec.gen_config(
+          exclude_current_positions_from_observation=False
+        )
+      )
+    )
+    self.run_space_check(env0, env1)
     self.run_deterministic_check(SwimmerEnvSpec, SwimmerGymEnvPool)
 
   def test_walker2d(self) -> None:
     env0 = mjc_mwe.Walker2dEnv()
     env1 = Walker2dGymEnvPool(Walker2dEnvSpec(Walker2dEnvSpec.gen_config()))
     self.run_space_check(env0, env1)
+    self.run_align_check(env0, env1)
+    env0 = mjc_mwe.Walker2dEnv(
+      terminate_when_unhealthy=False,
+      exclude_current_positions_from_observation=False,
+    )
+    env1 = Walker2dGymEnvPool(
+      Walker2dEnvSpec(
+        Walker2dEnvSpec.gen_config(
+          terminate_when_unhealthy=False,
+          exclude_current_positions_from_observation=False,
+          max_episode_steps=100,
+        )
+      )
+    )
+    self.run_space_check(env0, env1)
     self.run_align_check(env0, env1, no_time_limit=True)
     self.run_deterministic_check(Walker2dEnvSpec, Walker2dGymEnvPool)
 
diff --git a/envpool/mujoco/pusher.h b/envpool/mujoco/pusher.h
index c121848d..97fab1b8 100644
--- a/envpool/mujoco/pusher.h
+++ b/envpool/mujoco/pusher.h
@@ -44,11 +44,12 @@ class PusherEnvFns {
   static decltype(auto) StateSpec(const Config& conf) {
     mjtNum inf = std::numeric_limits<mjtNum>::infinity();
     return MakeDict("obs"_.bind(Spec<mjtNum>({23}, {-inf, inf})),
-                    "info:reward_dist"_.bind(Spec<mjtNum>({-1})),
-                    "info:reward_ctrl"_.bind(Spec<mjtNum>({-1})),
-                    // TODO(jiayi): remove these two lines for speed
+#ifdef ENVPOOL_TEST
                     "info:qpos0"_.bind(Spec<mjtNum>({11})),
-                    "info:qvel0"_.bind(Spec<mjtNum>({11})));
+                    "info:qvel0"_.bind(Spec<mjtNum>({11})),
+#endif
+                    "info:reward_dist"_.bind(Spec<mjtNum>({-1})),
+                    "info:reward_ctrl"_.bind(Spec<mjtNum>({-1})));
   }
   template <typename Config>
   static decltype(auto) ActionSpec(const Config& conf) {
@@ -60,50 +61,43 @@ typedef class EnvSpec<PusherEnvFns> PusherEnvSpec;
 
 class PusherEnv : public Env<PusherEnvSpec>, public MujocoEnv {
  protected:
-  int max_episode_steps_, elapsed_step_;
   mjtNum ctrl_cost_weight_, dist_cost_weight_, near_cost_weight_;
   mjtNum cylinder_dist_min_;
-  std::unique_ptr<mjtNum> qpos0_, qvel0_;  // for align check
   std::uniform_real_distribution<> dist_qpos_x_, dist_qpos_y_, dist_qvel_;
-  bool done_;
 
  public:
   PusherEnv(const Spec& spec, int env_id)
       : Env<PusherEnvSpec>(spec, env_id),
         MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/pusher.xml",
-                  spec.config["frame_skip"_], spec.config["post_constraint"_]),
-        max_episode_steps_(spec.config["max_episode_steps"_]),
-        elapsed_step_(max_episode_steps_ + 1),
+                  spec.config["frame_skip"_], spec.config["post_constraint"_],
+                  spec.config["max_episode_steps"_]),
         ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]),
         dist_cost_weight_(spec.config["dist_cost_weight"_]),
         near_cost_weight_(spec.config["near_cost_weight"_]),
         cylinder_dist_min_(spec.config["cylinder_dist_min"_]),
-        qpos0_(new mjtNum[model_->nq]),
-        qvel0_(new mjtNum[model_->nv]),
         dist_qpos_x_(spec.config["cylinder_x_min"_],
                      spec.config["cylinder_x_max"_]),
         dist_qpos_y_(spec.config["cylinder_y_min"_],
                      spec.config["cylinder_y_max"_]),
         dist_qvel_(-spec.config["reset_qvel_scale"_],
-                   spec.config["reset_qvel_scale"_]),
-        done_(true) {}
+                   spec.config["reset_qvel_scale"_]) {}
 
   void MujocoResetModel() {
     for (int i = 0; i < model_->nq - 4; ++i) {
-      data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i];
+      data_->qpos[i] = qpos0_[i] = init_qpos_[i];
     }
     while (1) {
       mjtNum x = dist_qpos_x_(gen_), y = dist_qpos_y_(gen_);
       if (std::sqrt(x * x + y * y) > cylinder_dist_min_) {
-        data_->qpos[model_->nq - 4] = qpos0_.get()[model_->nq - 4] = x;
-        data_->qpos[model_->nq - 3] = qpos0_.get()[model_->nq - 3] = y;
-        data_->qpos[model_->nq - 2] = qpos0_.get()[model_->nq - 2] = 0.0;
-        data_->qpos[model_->nq - 1] = qpos0_.get()[model_->nq - 1] = 0.0;
+        data_->qpos[model_->nq - 4] = qpos0_[model_->nq - 4] = x;
+        data_->qpos[model_->nq - 3] = qpos0_[model_->nq - 3] = y;
+        data_->qpos[model_->nq - 2] = qpos0_[model_->nq - 2] = 0.0;
+        data_->qpos[model_->nq - 1] = qpos0_[model_->nq - 1] = 0.0;
         break;
       }
     }
     for (int i = 0; i < model_->nv; ++i) {
-      data_->qvel[i] = qvel0_.get()[i] =
+      data_->qvel[i] = qvel0_[i] =
           i < model_->nv - 4 ? init_qvel_[i] + dist_qvel_(gen_) : 0.0;
     }
   }
@@ -163,8 +157,10 @@ class PusherEnv : public Env<PusherEnvSpec>, public MujocoEnv {
     // info
     state["info:reward_dist"_] = -dist_cost;
     state["info:reward_ctrl"_] = -ctrl_cost;
-    state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
-    state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
+#ifdef ENVPOOL_TEST
+    state["info:qpos0"_].Assign(qpos0_, model_->nq);
+    state["info:qvel0"_].Assign(qvel0_, model_->nv);
+#endif
   }
 };
 
diff --git a/envpool/mujoco/reacher.h b/envpool/mujoco/reacher.h
index 2b236800..567eb4c1 100644
--- a/envpool/mujoco/reacher.h
+++ b/envpool/mujoco/reacher.h
@@ -42,11 +42,12 @@ class ReacherEnvFns {
   static decltype(auto) StateSpec(const Config& conf) {
     mjtNum inf = std::numeric_limits<mjtNum>::infinity();
     return MakeDict("obs"_.bind(Spec<mjtNum>({11}, {-inf, inf})),
-                    "info:reward_dist"_.bind(Spec<mjtNum>({-1})),
-                    "info:reward_ctrl"_.bind(Spec<mjtNum>({-1})),
-                    // TODO(jiayi): remove these two lines for speed
+#ifdef ENVPOOL_TEST
                     "info:qpos0"_.bind(Spec<mjtNum>({4})),
-                    "info:qvel0"_.bind(Spec<mjtNum>({4})));
+                    "info:qvel0"_.bind(Spec<mjtNum>({4})),
+#endif
+                    "info:reward_dist"_.bind(Spec<mjtNum>({-1})),
+                    "info:reward_ctrl"_.bind(Spec<mjtNum>({-1})));
   }
   template <typename Config>
   static decltype(auto) ActionSpec(const Config& conf) {
@@ -58,47 +59,40 @@ typedef class EnvSpec<ReacherEnvFns> ReacherEnvSpec;
 
 class ReacherEnv : public Env<ReacherEnvSpec>, public MujocoEnv {
  protected:
-  int max_episode_steps_, elapsed_step_;
   mjtNum ctrl_cost_weight_, dist_cost_weight_;
   mjtNum reset_goal_scale_, dist_x_, dist_y_, dist_z_;
-  std::unique_ptr<mjtNum> qpos0_, qvel0_;  // for align check
   std::uniform_real_distribution<> dist_qpos_, dist_qvel_, dist_goal_;
-  bool done_;
 
  public:
   ReacherEnv(const Spec& spec, int env_id)
       : Env<ReacherEnvSpec>(spec, env_id),
         MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/reacher.xml",
-                  spec.config["frame_skip"_], spec.config["post_constraint"_]),
-        max_episode_steps_(spec.config["max_episode_steps"_]),
-        elapsed_step_(max_episode_steps_ + 1),
+                  spec.config["frame_skip"_], spec.config["post_constraint"_],
+                  spec.config["max_episode_steps"_]),
         ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]),
         dist_cost_weight_(spec.config["dist_cost_weight"_]),
         reset_goal_scale_(spec.config["reset_goal_scale"_]),
-        qpos0_(new mjtNum[model_->nq]),
-        qvel0_(new mjtNum[model_->nv]),
         dist_qpos_(-spec.config["reset_qpos_scale"_],
                    spec.config["reset_qpos_scale"_]),
         dist_qvel_(-spec.config["reset_qvel_scale"_],
                    spec.config["reset_qvel_scale"_]),
         dist_goal_(-spec.config["reset_goal_scale"_],
-                   spec.config["reset_goal_scale"_]),
-        done_(true) {}
+                   spec.config["reset_goal_scale"_]) {}
 
   void MujocoResetModel() {
     for (int i = 0; i < model_->nq - 2; ++i) {
-      data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_qpos_(gen_);
+      data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_qpos_(gen_);
     }
     while (1) {
       mjtNum x = dist_goal_(gen_), y = dist_goal_(gen_);
       if (std::sqrt(x * x + y * y) < reset_goal_scale_) {
-        data_->qpos[model_->nq - 2] = qpos0_.get()[model_->nq - 2] = x;
-        data_->qpos[model_->nq - 1] = qpos0_.get()[model_->nq - 1] = y;
+        data_->qpos[model_->nq - 2] = qpos0_[model_->nq - 2] = x;
+        data_->qpos[model_->nq - 1] = qpos0_[model_->nq - 1] = y;
         break;
       }
     }
     for (int i = 0; i < model_->nv; ++i) {
-      data_->qvel[i] = qvel0_.get()[i] =
+      data_->qvel[i] = qvel0_[i] =
           i < model_->nv - 2 ? init_qvel_[i] + dist_qvel_(gen_) : 0.0;
     }
   }
@@ -163,8 +157,10 @@ class ReacherEnv : public Env<ReacherEnvSpec>, public MujocoEnv {
     // info
     state["info:reward_dist"_] = -dist_cost;
     state["info:reward_ctrl"_] = -ctrl_cost;
-    state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
-    state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
+#ifdef ENVPOOL_TEST
+    state["info:qpos0"_].Assign(qpos0_, model_->nq);
+    state["info:qvel0"_].Assign(qvel0_, model_->nv);
+#endif
   }
 };
 
diff --git a/envpool/mujoco/registration.py b/envpool/mujoco/registration.py
index cb02422c..d56f478a 100644
--- a/envpool/mujoco/registration.py
+++ b/envpool/mujoco/registration.py
@@ -51,7 +51,6 @@
     spec_cls=f"{task}EnvSpec",
     dm_cls=f"{task}DMEnvPool",
     gym_cls=f"{task}GymEnvPool",
-    max_episode_steps=1000,
     base_path=base_path,
     post_constraint=post_constraint,
   )
diff --git a/envpool/mujoco/swimmer.h b/envpool/mujoco/swimmer.h
index 991ac618..b3d4a70e 100644
--- a/envpool/mujoco/swimmer.h
+++ b/envpool/mujoco/swimmer.h
@@ -34,23 +34,26 @@ class SwimmerEnvFns {
     return MakeDict(
         "max_episode_steps"_.bind(1000), "reward_threshold"_.bind(360.0),
         "frame_skip"_.bind(4), "post_constraint"_.bind(true),
+        "exclude_current_positions_from_observation"_.bind(true),
         "forward_reward_weight"_.bind(1.0), "ctrl_cost_weight"_.bind(1e-4),
         "reset_noise_scale"_.bind(0.1));
   }
   template <typename Config>
   static decltype(auto) StateSpec(const Config& conf) {
     mjtNum inf = std::numeric_limits<mjtNum>::infinity();
-    return MakeDict("obs"_.bind(Spec<mjtNum>({8}, {-inf, inf})),
+    bool no_pos = conf["exclude_current_positions_from_observation"_];
+    return MakeDict("obs"_.bind(Spec<mjtNum>({no_pos ? 8 : 10}, {-inf, inf})),
+#ifdef ENVPOOL_TEST
+                    "info:qpos0"_.bind(Spec<mjtNum>({5})),
+                    "info:qvel0"_.bind(Spec<mjtNum>({5})),
+#endif
                     "info:reward_fwd"_.bind(Spec<mjtNum>({-1})),
                     "info:reward_ctrl"_.bind(Spec<mjtNum>({-1})),
                     "info:x_position"_.bind(Spec<mjtNum>({-1})),
                     "info:y_position"_.bind(Spec<mjtNum>({-1})),
                     "info:distance_from_origin"_.bind(Spec<mjtNum>({-1})),
                     "info:x_velocity"_.bind(Spec<mjtNum>({-1})),
-                    "info:y_velocity"_.bind(Spec<mjtNum>({-1})),
-                    // TODO(jiayi): remove these two lines for speed
-                    "info:qpos0"_.bind(Spec<mjtNum>({5})),
-                    "info:qvel0"_.bind(Spec<mjtNum>({5})));
+                    "info:y_velocity"_.bind(Spec<mjtNum>({-1})));
   }
   template <typename Config>
   static decltype(auto) ActionSpec(const Config& conf) {
@@ -62,33 +65,28 @@ typedef class EnvSpec<SwimmerEnvFns> SwimmerEnvSpec;
 
 class SwimmerEnv : public Env<SwimmerEnvSpec>, public MujocoEnv {
  protected:
-  int max_episode_steps_, elapsed_step_;
+  bool no_pos_;
   mjtNum ctrl_cost_weight_, forward_reward_weight_;
-  std::unique_ptr<mjtNum> qpos0_, qvel0_;  // for align check
   std::uniform_real_distribution<> dist_;
-  bool done_;
 
  public:
   SwimmerEnv(const Spec& spec, int env_id)
       : Env<SwimmerEnvSpec>(spec, env_id),
         MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/swimmer.xml",
-                  spec.config["frame_skip"_], spec.config["post_constraint"_]),
-        max_episode_steps_(spec.config["max_episode_steps"_]),
-        elapsed_step_(max_episode_steps_ + 1),
+                  spec.config["frame_skip"_], spec.config["post_constraint"_],
+                  spec.config["max_episode_steps"_]),
+        no_pos_(spec.config["exclude_current_positions_from_observation"_]),
         ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]),
         forward_reward_weight_(spec.config["forward_reward_weight"_]),
-        qpos0_(new mjtNum[model_->nq]),
-        qvel0_(new mjtNum[model_->nv]),
         dist_(-spec.config["reset_noise_scale"_],
-              spec.config["reset_noise_scale"_]),
-        done_(true) {}
+              spec.config["reset_noise_scale"_]) {}
 
   void MujocoResetModel() {
     for (int i = 0; i < model_->nq; ++i) {
-      data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_(gen_);
+      data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_(gen_);
     }
     for (int i = 0; i < model_->nv; ++i) {
-      data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_(gen_);
+      data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_(gen_);
     }
   }
 
@@ -131,7 +129,7 @@ class SwimmerEnv : public Env<SwimmerEnvSpec>, public MujocoEnv {
     state["reward"_] = reward;
     // obs
     mjtNum* obs = static_cast<mjtNum*>(state["obs"_].data());
-    for (int i = 2; i < model_->nq; ++i) {
+    for (int i = no_pos_ ? 2 : 0; i < model_->nq; ++i) {
       *(obs++) = data_->qpos[i];
     }
     for (int i = 0; i < model_->nv; ++i) {
@@ -146,8 +144,10 @@ class SwimmerEnv : public Env<SwimmerEnvSpec>, public MujocoEnv {
         std::sqrt(x_after * x_after + y_after * y_after);
     state["info:x_velocity"_] = xv;
     state["info:y_velocity"_] = yv;
-    state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
-    state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
+#ifdef ENVPOOL_TEST
+    state["info:qpos0"_].Assign(qpos0_, model_->nq);
+    state["info:qvel0"_].Assign(qvel0_, model_->nv);
+#endif
   }
 };
 
diff --git a/envpool/mujoco/walker2d.h b/envpool/mujoco/walker2d.h
index 5c486787..147aa266 100644
--- a/envpool/mujoco/walker2d.h
+++ b/envpool/mujoco/walker2d.h
@@ -34,6 +34,8 @@ class Walker2dEnvFns {
     return MakeDict(
         "max_episode_steps"_.bind(1000), "frame_skip"_.bind(4),
         "post_constraint"_.bind(true), "ctrl_cost_weight"_.bind(0.001),
+        "terminate_when_unhealthy"_.bind(true),
+        "exclude_current_positions_from_observation"_.bind(true),
         "forward_reward_weight"_.bind(1.0), "healthy_reward"_.bind(1.0),
         "healthy_z_min"_.bind(0.8), "healthy_z_max"_.bind(2.0),
         "healthy_angle_min"_.bind(-1.0), "healthy_angle_max"_.bind(1.0),
@@ -43,12 +45,14 @@ class Walker2dEnvFns {
   template <typename Config>
   static decltype(auto) StateSpec(const Config& conf) {
     mjtNum inf = std::numeric_limits<mjtNum>::infinity();
-    return MakeDict("obs"_.bind(Spec<mjtNum>({17}, {-inf, inf})),
-                    "info:x_position"_.bind(Spec<mjtNum>({-1})),
-                    "info:x_velocity"_.bind(Spec<mjtNum>({-1})),
-                    // TODO(jiayi): remove these two lines for speed
+    bool no_pos = conf["exclude_current_positions_from_observation"_];
+    return MakeDict("obs"_.bind(Spec<mjtNum>({no_pos ? 17 : 18}, {-inf, inf})),
+#ifdef ENVPOOL_TEST
                     "info:qpos0"_.bind(Spec<mjtNum>({9})),
-                    "info:qvel0"_.bind(Spec<mjtNum>({9})));
+                    "info:qvel0"_.bind(Spec<mjtNum>({9})),
+#endif
+                    "info:x_position"_.bind(Spec<mjtNum>({-1})),
+                    "info:x_velocity"_.bind(Spec<mjtNum>({-1})));
   }
   template <typename Config>
   static decltype(auto) ActionSpec(const Config& conf) {
@@ -60,22 +64,21 @@ typedef class EnvSpec<Walker2dEnvFns> Walker2dEnvSpec;
 
 class Walker2dEnv : public Env<Walker2dEnvSpec>, public MujocoEnv {
  protected:
-  int max_episode_steps_, elapsed_step_;
+  bool terminate_when_unhealthy_, no_pos_;
   mjtNum ctrl_cost_weight_, forward_reward_weight_;
   mjtNum healthy_reward_, healthy_z_min_, healthy_z_max_;
   mjtNum healthy_angle_min_, healthy_angle_max_;
   mjtNum velocity_min_, velocity_max_;
-  std::unique_ptr<mjtNum> qpos0_, qvel0_;  // for align check
   std::uniform_real_distribution<> dist_;
-  bool done_;
 
  public:
   Walker2dEnv(const Spec& spec, int env_id)
       : Env<Walker2dEnvSpec>(spec, env_id),
         MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/walker2d.xml",
-                  spec.config["frame_skip"_], spec.config["post_constraint"_]),
-        max_episode_steps_(spec.config["max_episode_steps"_]),
-        elapsed_step_(max_episode_steps_ + 1),
+                  spec.config["frame_skip"_], spec.config["post_constraint"_],
+                  spec.config["max_episode_steps"_]),
+        terminate_when_unhealthy_(spec.config["terminate_when_unhealthy"_]),
+        no_pos_(spec.config["exclude_current_positions_from_observation"_]),
         ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]),
         forward_reward_weight_(spec.config["forward_reward_weight"_]),
         healthy_reward_(spec.config["healthy_reward"_]),
@@ -85,18 +88,15 @@ class Walker2dEnv : public Env<Walker2dEnvSpec>, public MujocoEnv {
         healthy_angle_max_(spec.config["healthy_angle_max"_]),
         velocity_min_(spec.config["velocity_min"_]),
         velocity_max_(spec.config["velocity_max"_]),
-        qpos0_(new mjtNum[model_->nq]),
-        qvel0_(new mjtNum[model_->nv]),
         dist_(-spec.config["reset_noise_scale"_],
-              spec.config["reset_noise_scale"_]),
-        done_(true) {}
+              spec.config["reset_noise_scale"_]) {}
 
   void MujocoResetModel() {
     for (int i = 0; i < model_->nq; ++i) {
-      data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_(gen_);
+      data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_(gen_);
     }
     for (int i = 0; i < model_->nv; ++i) {
-      data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_(gen_);
+      data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_(gen_);
     }
   }
 
@@ -125,9 +125,12 @@ class Walker2dEnv : public Env<Walker2dEnvSpec>, public MujocoEnv {
     mjtNum dt = frame_skip_ * model_->opt.timestep;
     mjtNum xv = (x_after - x_before) / dt;
     // reward and done
-    float reward = xv * forward_reward_weight_ + healthy_reward_ - ctrl_cost;
+    mjtNum healthy_reward =
+        terminate_when_unhealthy_ || IsHealthy() ? healthy_reward_ : 0.0;
+    float reward = xv * forward_reward_weight_ + healthy_reward - ctrl_cost;
     ++elapsed_step_;
-    done_ = !IsHealthy() || (elapsed_step_ >= max_episode_steps_);
+    done_ = (terminate_when_unhealthy_ ? !IsHealthy() : false) ||
+            (elapsed_step_ >= max_episode_steps_);
     WriteObs(reward, xv, x_after);
   }
 
@@ -148,7 +151,7 @@ class Walker2dEnv : public Env<Walker2dEnvSpec>, public MujocoEnv {
     state["reward"_] = reward;
     // obs
     mjtNum* obs = static_cast<mjtNum*>(state["obs"_].data());
-    for (int i = 1; i < model_->nq; ++i) {
+    for (int i = no_pos_ ? 1 : 0; i < model_->nq; ++i) {
       *(obs++) = data_->qpos[i];
     }
     for (int i = 0; i < model_->nv; ++i) {
@@ -160,8 +163,10 @@ class Walker2dEnv : public Env<Walker2dEnvSpec>, public MujocoEnv {
     // info
     state["info:x_position"_] = x_after;
     state["info:x_velocity"_] = xv;
-    state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
-    state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
+#ifdef ENVPOOL_TEST
+    state["info:qpos0"_].Assign(qpos0_, model_->nq);
+    state["info:qvel0"_].Assign(qvel0_, model_->nv);
+#endif
   }
 };
 
diff --git a/examples/make_env.py b/examples/make_env.py
index caf1a13e..660621e2 100644
--- a/examples/make_env.py
+++ b/examples/make_env.py
@@ -58,6 +58,13 @@ def make_spec() -> None:
   assert dm_act_spec.num_values, 6
 
 
+def check_info_optim() -> None:
+  env = envpool.make_gym("Ant-v3")
+  info = env.step(np.array([env.action_space.sample()]), np.array([0]))[-1]
+  assert "qpos0" not in info and "qvel0" not in info
+
+
 if __name__ == "__main__":
   make_env()
   make_spec()
+  check_info_optim()