From 8a269d86f20722a6375ac566675a9fdeaf226fe5 Mon Sep 17 00:00:00 2001 From: Jiayi Weng Date: Sat, 30 Apr 2022 06:02:39 +0800 Subject: [PATCH] Align with gym mujoco options (#93) 1. simplify code and fix a bug of max_episode_steps 2. add `terminate_when_unhealthy` and `exclude_current_positions_from_observation` for most of mujoco envs 3. add testing macro `ENVPOOL_TEST` 4. eliminate `info["qpos0"]` and `info["qvel0"]` when generating wheel --- .bazelrc | 3 +- Makefile | 4 +- docs/conf.py | 2 +- docs/pages/contributing.rst | 3 + docs/pages/env.rst | 11 ++++ docs/pages/interface.rst | 4 +- envpool/core/env.h | 10 +-- envpool/mujoco/ant.h | 79 ++++++++++++----------- envpool/mujoco/half_cheetah.h | 40 ++++++------ envpool/mujoco/hopper.h | 49 +++++++------- envpool/mujoco/humanoid.h | 78 ++++++++++++---------- envpool/mujoco/humanoid_standup.h | 47 +++++++------- envpool/mujoco/inverted_double_pendulum.h | 28 ++++---- envpool/mujoco/inverted_pendulum.h | 28 ++++---- envpool/mujoco/mujoco_env.h | 15 ++++- envpool/mujoco/mujoco_test.py | 78 ++++++++++++++++++++++ envpool/mujoco/pusher.h | 40 ++++++------ envpool/mujoco/reacher.h | 36 +++++------ envpool/mujoco/registration.py | 1 - envpool/mujoco/swimmer.h | 40 ++++++------ envpool/mujoco/walker2d.h | 49 +++++++------- examples/make_env.py | 7 ++ 22 files changed, 388 insertions(+), 264 deletions(-) diff --git a/.bazelrc b/.bazelrc index fcb2ce7a..f3fdebe0 100644 --- a/.bazelrc +++ b/.bazelrc @@ -2,7 +2,8 @@ build --action_env=BAZEL_LINKLIBS=-l%:libstdc++.a:-lm build --action_env=BAZEL_LINKOPTS=-static-libgcc build --incompatible_strict_action_env --cxxopt=-std=c++17 --host_cxxopt=-std=c++17 --client_env=BAZEL_CXXOPTS=-std=c++17 build:release --copt=-g0 --copt=-O3 --copt=-DNDEBUG --copt=-msse --copt=-msse2 --copt=-mmmx -build:debug --compilation_mode=dbg -s +build:debug --cxxopt=-DENVPOOL_TEST --compilation_mode=dbg -s +build:test --cxxopt=-DENVPOOL_TEST build:clang-tidy --aspects @bazel_clang_tidy//clang_tidy:clang_tidy.bzl%clang_tidy_aspect build:clang-tidy --@bazel_clang_tidy//:clang_tidy_config=//:clang_tidy_config diff --git a/Makefile b/Makefile index 6d2fadd5..05df27a3 100644 --- a/Makefile +++ b/Makefile @@ -97,7 +97,7 @@ bazel-build: bazel-install cp bazel-bin/setup.runfiles/$(PROJECT_NAME)/dist/*.whl ./dist bazel-test: bazel-install - bazel test --test_output=all $(BAZELOPT) //... --config=release + bazel test --test_output=all $(BAZELOPT) //... --config=release --config=test bazel-clean: bazel-install bazel clean --expunge @@ -154,7 +154,7 @@ release-test1: cd envpool && python3 make_test.py release-test2: - cd examples && python3 env_step.py + cd examples && python3 make_env.py && python3 env_step.py release-test: release-test1 release-test2 diff --git a/docs/conf.py b/docs/conf.py index 63e1fa7f..9e68bd2d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -29,7 +29,7 @@ def get_version() -> str: # -- Project information ----------------------------------------------------- project = "EnvPool" -copyright = "2021, Garena Online Private Limited" +copyright = "2022, Garena Online Private Limited" author = "EnvPool Contributors" # The full version, including alpha/beta/rc tags diff --git a/docs/pages/contributing.rst b/docs/pages/contributing.rst index 6e753a4e..f5d03f79 100644 --- a/docs/pages/contributing.rst +++ b/docs/pages/contributing.rst @@ -66,6 +66,9 @@ integration; however, you don't want to build other stuff such as OpenCV: .. code-block:: bash bazel test --test_output=all //envpool/mujoco:mujoco_test --config=release + # or alternatively + cd bazel-bin/envpool/mujoco/mujoco_test.runfiles/envpool/ + ./envpool/mujoco/mujoco_test Feel free to customize the command in ``Makefile``! diff --git a/docs/pages/env.rst b/docs/pages/env.rst index 8c2744f5..67e0dc59 100644 --- a/docs/pages/env.rst +++ b/docs/pages/env.rst @@ -389,6 +389,17 @@ Miscellaneous has already been defined as ``gen_`` (`link `_). +.. note :: + + ``ENVPOOL_TEST`` is a test-time macro. If you want a piece of C++ code only + available during unit test: + + .. code-block:: c++ + + #ifdef ENVPOOL_TEST + fprintf(stderr, "here"); + #endif + Generate Dynamic Linked .so File and Instantiate in Python ---------------------------------------------------------- diff --git a/docs/pages/interface.rst b/docs/pages/interface.rst index 98a4c611..e098a254 100644 --- a/docs/pages/interface.rst +++ b/docs/pages/interface.rst @@ -1,5 +1,5 @@ -Interface -========= +Python Interface +================ envpool.make ------------ diff --git a/envpool/core/env.h b/envpool/core/env.h index 5e363632..8b3583a2 100644 --- a/envpool/core/env.h +++ b/envpool/core/env.h @@ -38,7 +38,7 @@ class Env { private: StateBufferQueue* sbq_; - int order_, elapsed_step_; + int order_, current_step_; bool is_single_player_; StateBuffer::WritableSlice slice_; // for parsing single env action from input action batch @@ -60,7 +60,7 @@ class Env { env_id_(env_id), seed_(spec.config["seed"_] + env_id), gen_(seed_), - elapsed_step_(-1), + current_step_(-1), is_single_player_(max_num_players_ == 1), action_specs_(spec.action_spec.template values()), is_player_action_(Transform(action_specs_, [](const ShapeSpec& s) { @@ -147,9 +147,9 @@ class Env { sbq_ = sbq; order_ = order; if (reset) { - elapsed_step_ = 0; + current_step_ = 0; } else { - ++elapsed_step_; + ++current_step_; } } @@ -163,7 +163,7 @@ class Env { State state(&slice_.arr); state["done"_] = IsDone(); state["info:env_id"_] = env_id_; - state["elapsed_step"_] = elapsed_step_; + state["elapsed_step"_] = current_step_; int* player_env_id(static_cast(state["info:players.env_id"_].data())); for (int i = 0; i < player_num; ++i) { player_env_id[i] = env_id_; diff --git a/envpool/mujoco/ant.h b/envpool/mujoco/ant.h index b371ccba..225d16c1 100644 --- a/envpool/mujoco/ant.h +++ b/envpool/mujoco/ant.h @@ -34,6 +34,8 @@ class AntEnvFns { return MakeDict( "max_episode_steps"_.bind(1000), "reward_threshold"_.bind(6000.0), "frame_skip"_.bind(5), "post_constraint"_.bind(true), + "terminate_when_unhealthy"_.bind(true), + "exclude_current_positions_from_observation"_.bind(true), "forward_reward_weight"_.bind(1.0), "ctrl_cost_weight"_.bind(0.5), "contact_cost_weight"_.bind(5e-4), "healthy_reward"_.bind(1.0), "healthy_z_min"_.bind(0.2), "healthy_z_max"_.bind(1.0), @@ -43,19 +45,22 @@ class AntEnvFns { template static decltype(auto) StateSpec(const Config& conf) { mjtNum inf = std::numeric_limits::infinity(); - return MakeDict("obs"_.bind(Spec({111}, {-inf, inf})), - "info:reward_forward"_.bind(Spec({-1})), - "info:reward_ctrl"_.bind(Spec({-1})), - "info:reward_contact"_.bind(Spec({-1})), - "info:reward_survive"_.bind(Spec({-1})), - "info:x_position"_.bind(Spec({-1})), - "info:y_position"_.bind(Spec({-1})), - "info:distance_from_origin"_.bind(Spec({-1})), - "info:x_velocity"_.bind(Spec({-1})), - "info:y_velocity"_.bind(Spec({-1})), - // TODO(jiayi): remove these two lines for speed - "info:qpos0"_.bind(Spec({15})), - "info:qvel0"_.bind(Spec({14}))); + bool no_pos = conf["exclude_current_positions_from_observation"_]; + return MakeDict( + "obs"_.bind(Spec({no_pos ? 111 : 113}, {-inf, inf})), +#ifdef ENVPOOL_TEST + "info:qpos0"_.bind(Spec({15})), + "info:qvel0"_.bind(Spec({14})), +#endif + "info:reward_forward"_.bind(Spec({-1})), + "info:reward_ctrl"_.bind(Spec({-1})), + "info:reward_contact"_.bind(Spec({-1})), + "info:reward_survive"_.bind(Spec({-1})), + "info:x_position"_.bind(Spec({-1})), + "info:y_position"_.bind(Spec({-1})), + "info:distance_from_origin"_.bind(Spec({-1})), + "info:x_velocity"_.bind(Spec({-1})), + "info:y_velocity"_.bind(Spec({-1}))); } template static decltype(auto) ActionSpec(const Config& conf) { @@ -67,23 +72,22 @@ typedef class EnvSpec AntEnvSpec; class AntEnv : public Env, public MujocoEnv { protected: - int max_episode_steps_, elapsed_step_; + bool terminate_when_unhealthy_, no_pos_; mjtNum ctrl_cost_weight_, contact_cost_weight_; mjtNum forward_reward_weight_, healthy_reward_; mjtNum healthy_z_min_, healthy_z_max_; mjtNum contact_force_min_, contact_force_max_; - std::unique_ptr qpos0_, qvel0_; // for align check std::uniform_real_distribution<> dist_qpos_; std::normal_distribution<> dist_qvel_; - bool done_; public: AntEnv(const Spec& spec, int env_id) : Env(spec, env_id), MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/ant.xml", - spec.config["frame_skip"_], spec.config["post_constraint"_]), - max_episode_steps_(spec.config["max_episode_steps"_]), - elapsed_step_(max_episode_steps_ + 1), + spec.config["frame_skip"_], spec.config["post_constraint"_], + spec.config["max_episode_steps"_]), + terminate_when_unhealthy_(spec.config["terminate_when_unhealthy"_]), + no_pos_(spec.config["exclude_current_positions_from_observation"_]), ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]), contact_cost_weight_(spec.config["contact_cost_weight"_]), forward_reward_weight_(spec.config["forward_reward_weight"_]), @@ -92,19 +96,16 @@ class AntEnv : public Env, public MujocoEnv { healthy_z_max_(spec.config["healthy_z_max"_]), contact_force_min_(spec.config["contact_force_min"_]), contact_force_max_(spec.config["contact_force_max"_]), - qpos0_(new mjtNum[model_->nq]), - qvel0_(new mjtNum[model_->nv]), dist_qpos_(-spec.config["reset_noise_scale"_], spec.config["reset_noise_scale"_]), - dist_qvel_(0, spec.config["reset_noise_scale"_]), - done_(true) {} + dist_qvel_(0, spec.config["reset_noise_scale"_]) {} void MujocoResetModel() { for (int i = 0; i < model_->nq; ++i) { - data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_qpos_(gen_); + data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_qpos_(gen_); } for (int i = 0; i < model_->nv; ++i) { - data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_qvel_(gen_); + data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_qvel_(gen_); } } @@ -114,7 +115,7 @@ class AntEnv : public Env, public MujocoEnv { done_ = false; elapsed_step_ = 0; MujocoReset(); - WriteObs(0.0f, 0, 0, 0, 0, 0, 0); + WriteObs(0.0f, 0, 0, 0, 0, 0, 0, 0); } void Step(const Action& action) override { @@ -141,13 +142,16 @@ class AntEnv : public Env, public MujocoEnv { x = std::max(contact_force_min_, x); contact_cost += contact_cost_weight_ * x * x; } - // reward and done - float reward = xv * forward_reward_weight_ + healthy_reward_ - ctrl_cost - - contact_cost; + mjtNum healthy_reward = + terminate_when_unhealthy_ || IsHealthy() ? healthy_reward_ : 0.0; + float reward = + xv * forward_reward_weight_ + healthy_reward - ctrl_cost - contact_cost; ++elapsed_step_; - done_ = !IsHealthy() || (elapsed_step_ >= max_episode_steps_); - WriteObs(reward, xv, yv, ctrl_cost, contact_cost, x_after, y_after); + done_ = (terminate_when_unhealthy_ ? !IsHealthy() : false) || + (elapsed_step_ >= max_episode_steps_); + WriteObs(reward, xv, yv, ctrl_cost, contact_cost, x_after, y_after, + healthy_reward); } private: @@ -169,12 +173,13 @@ class AntEnv : public Env, public MujocoEnv { } void WriteObs(float reward, mjtNum xv, mjtNum yv, mjtNum ctrl_cost, - mjtNum contact_cost, mjtNum x_after, mjtNum y_after) { + mjtNum contact_cost, mjtNum x_after, mjtNum y_after, + mjtNum healthy_reward) { State state = Allocate(); state["reward"_] = reward; // obs mjtNum* obs = static_cast(state["obs"_].data()); - for (int i = 2; i < model_->nq; ++i) { + for (int i = no_pos_ ? 2 : 0; i < model_->nq; ++i) { *(obs++) = data_->qpos[i]; } for (int i = 0; i < model_->nv; ++i) { @@ -190,15 +195,17 @@ class AntEnv : public Env, public MujocoEnv { state["info:reward_forward"_] = xv * forward_reward_weight_; state["info:reward_ctrl"_] = -ctrl_cost; state["info:reward_contact"_] = -contact_cost; - state["info:reward_survive"_] = healthy_reward_; + state["info:reward_survive"_] = healthy_reward; state["info:x_position"_] = x_after; state["info:y_position"_] = y_after; state["info:distance_from_origin"_] = std::sqrt(x_after * x_after + y_after * y_after); state["info:x_velocity"_] = xv; state["info:y_velocity"_] = yv; - state["info:qpos0"_].Assign(qpos0_.get(), model_->nq); - state["info:qvel0"_].Assign(qvel0_.get(), model_->nv); +#ifdef ENVPOOL_TEST + state["info:qpos0"_].Assign(qpos0_, model_->nq); + state["info:qvel0"_].Assign(qvel0_, model_->nv); +#endif } }; diff --git a/envpool/mujoco/half_cheetah.h b/envpool/mujoco/half_cheetah.h index e530b9f7..322a484a 100644 --- a/envpool/mujoco/half_cheetah.h +++ b/envpool/mujoco/half_cheetah.h @@ -34,20 +34,23 @@ class HalfCheetahEnvFns { return MakeDict( "max_episode_steps"_.bind(1000), "reward_threshold"_.bind(4800.0), "frame_skip"_.bind(5), "post_constraint"_.bind(true), + "exclude_current_positions_from_observation"_.bind(true), "ctrl_cost_weight"_.bind(0.1), "forward_reward_weight"_.bind(1.0), "reset_noise_scale"_.bind(0.1)); } template static decltype(auto) StateSpec(const Config& conf) { mjtNum inf = std::numeric_limits::infinity(); - return MakeDict("obs"_.bind(Spec({17}, {-inf, inf})), + bool no_pos = conf["exclude_current_positions_from_observation"_]; + return MakeDict("obs"_.bind(Spec({no_pos ? 17 : 18}, {-inf, inf})), +#ifdef ENVPOOL_TEST + "info:qpos0"_.bind(Spec({9})), + "info:qvel0"_.bind(Spec({9})), +#endif "info:reward_run"_.bind(Spec({-1})), "info:reward_ctrl"_.bind(Spec({-1})), "info:x_position"_.bind(Spec({-1})), - "info:x_velocity"_.bind(Spec({-1})), - // TODO(jiayi): remove these two lines for speed - "info:qpos0"_.bind(Spec({9})), - "info:qvel0"_.bind(Spec({9}))); + "info:x_velocity"_.bind(Spec({-1}))); } template static decltype(auto) ActionSpec(const Config& conf) { @@ -59,35 +62,30 @@ typedef class EnvSpec HalfCheetahEnvSpec; class HalfCheetahEnv : public Env, public MujocoEnv { protected: - int max_episode_steps_, elapsed_step_; + bool no_pos_; mjtNum ctrl_cost_weight_, forward_reward_weight_; - std::unique_ptr qpos0_, qvel0_; // for align check std::uniform_real_distribution<> dist_qpos_; std::normal_distribution<> dist_qvel_; - bool done_; public: HalfCheetahEnv(const Spec& spec, int env_id) : Env(spec, env_id), MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/half_cheetah.xml", - spec.config["frame_skip"_], spec.config["post_constraint"_]), - max_episode_steps_(spec.config["max_episode_steps"_]), - elapsed_step_(max_episode_steps_ + 1), + spec.config["frame_skip"_], spec.config["post_constraint"_], + spec.config["max_episode_steps"_]), + no_pos_(spec.config["exclude_current_positions_from_observation"_]), ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]), forward_reward_weight_(spec.config["forward_reward_weight"_]), - qpos0_(new mjtNum[model_->nq]), - qvel0_(new mjtNum[model_->nv]), dist_qpos_(-spec.config["reset_noise_scale"_], spec.config["reset_noise_scale"_]), - dist_qvel_(0, spec.config["reset_noise_scale"_]), - done_(true) {} + dist_qvel_(0, spec.config["reset_noise_scale"_]) {} void MujocoResetModel() { for (int i = 0; i < model_->nq; ++i) { - data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_qpos_(gen_); + data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_qpos_(gen_); } for (int i = 0; i < model_->nv; ++i) { - data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_qvel_(gen_); + data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_qvel_(gen_); } } @@ -127,7 +125,7 @@ class HalfCheetahEnv : public Env, public MujocoEnv { state["reward"_] = reward; // obs mjtNum* obs = static_cast(state["obs"_].data()); - for (int i = 1; i < model_->nq; ++i) { + for (int i = no_pos_ ? 1 : 0; i < model_->nq; ++i) { *(obs++) = data_->qpos[i]; } for (int i = 0; i < model_->nv; ++i) { @@ -138,8 +136,10 @@ class HalfCheetahEnv : public Env, public MujocoEnv { state["info:reward_ctrl"_] = -ctrl_cost; state["info:x_position"_] = x_after; state["info:x_velocity"_] = xv; - state["info:qpos0"_].Assign(qpos0_.get(), model_->nq); - state["info:qvel0"_].Assign(qvel0_.get(), model_->nv); +#ifdef ENVPOOL_TEST + state["info:qpos0"_].Assign(qpos0_, model_->nq); + state["info:qvel0"_].Assign(qvel0_, model_->nv); +#endif } }; diff --git a/envpool/mujoco/hopper.h b/envpool/mujoco/hopper.h index 30b47a41..e6b72488 100644 --- a/envpool/mujoco/hopper.h +++ b/envpool/mujoco/hopper.h @@ -34,6 +34,8 @@ class HopperEnvFns { return MakeDict( "max_episode_steps"_.bind(1000), "reward_threshold"_.bind(6000.0), "frame_skip"_.bind(4), "post_constraint"_.bind(true), + "terminate_when_unhealthy"_.bind(true), + "exclude_current_positions_from_observation"_.bind(true), "ctrl_cost_weight"_.bind(1e-3), "forward_reward_weight"_.bind(1.0), "healthy_reward"_.bind(1.0), "velocity_min"_.bind(-10.0), "velocity_max"_.bind(10.0), "healthy_state_min"_.bind(-100.0), @@ -44,12 +46,14 @@ class HopperEnvFns { template static decltype(auto) StateSpec(const Config& conf) { mjtNum inf = std::numeric_limits::infinity(); - return MakeDict("obs"_.bind(Spec({11}, {-inf, inf})), - "info:x_position"_.bind(Spec({-1})), - "info:x_velocity"_.bind(Spec({-1})), - // TODO(jiayi): remove these two lines for speed + bool no_pos = conf["exclude_current_positions_from_observation"_]; + return MakeDict("obs"_.bind(Spec({no_pos ? 11 : 12}, {-inf, inf})), +#ifdef ENVPOOL_TEST "info:qpos0"_.bind(Spec({6})), - "info:qvel0"_.bind(Spec({6}))); + "info:qvel0"_.bind(Spec({6})), +#endif + "info:x_position"_.bind(Spec({-1})), + "info:x_velocity"_.bind(Spec({-1}))); } template static decltype(auto) ActionSpec(const Config& conf) { @@ -61,23 +65,22 @@ typedef class EnvSpec HopperEnvSpec; class HopperEnv : public Env, public MujocoEnv { protected: - int max_episode_steps_, elapsed_step_; + bool terminate_when_unhealthy_, no_pos_; mjtNum ctrl_cost_weight_, forward_reward_weight_; mjtNum healthy_reward_, healthy_z_min_; mjtNum velocity_min_, velocity_max_; mjtNum healthy_state_min_, healthy_state_max_; mjtNum healthy_angle_min_, healthy_angle_max_; - std::unique_ptr qpos0_, qvel0_; // for align check std::uniform_real_distribution<> dist_; - bool done_; public: HopperEnv(const Spec& spec, int env_id) : Env(spec, env_id), MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/hopper.xml", - spec.config["frame_skip"_], spec.config["post_constraint"_]), - max_episode_steps_(spec.config["max_episode_steps"_]), - elapsed_step_(max_episode_steps_ + 1), + spec.config["frame_skip"_], spec.config["post_constraint"_], + spec.config["max_episode_steps"_]), + terminate_when_unhealthy_(spec.config["terminate_when_unhealthy"_]), + no_pos_(spec.config["exclude_current_positions_from_observation"_]), ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]), forward_reward_weight_(spec.config["forward_reward_weight"_]), healthy_reward_(spec.config["healthy_reward"_]), @@ -88,18 +91,15 @@ class HopperEnv : public Env, public MujocoEnv { healthy_state_max_(spec.config["healthy_state_max"_]), healthy_angle_min_(spec.config["healthy_angle_min"_]), healthy_angle_max_(spec.config["healthy_angle_max"_]), - qpos0_(new mjtNum[model_->nq]), - qvel0_(new mjtNum[model_->nv]), dist_(-spec.config["reset_noise_scale"_], - spec.config["reset_noise_scale"_]), - done_(true) {} + spec.config["reset_noise_scale"_]) {} void MujocoResetModel() { for (int i = 0; i < model_->nq; ++i) { - data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_(gen_); + data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_(gen_); } for (int i = 0; i < model_->nv; ++i) { - data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_(gen_); + data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_(gen_); } } @@ -128,9 +128,12 @@ class HopperEnv : public Env, public MujocoEnv { mjtNum dt = frame_skip_ * model_->opt.timestep; mjtNum xv = (x_after - x_before) / dt; // reward and done - float reward = xv * forward_reward_weight_ + healthy_reward_ - ctrl_cost; + mjtNum healthy_reward = + terminate_when_unhealthy_ || IsHealthy() ? healthy_reward_ : 0.0; + float reward = xv * forward_reward_weight_ + healthy_reward - ctrl_cost; ++elapsed_step_; - done_ = !IsHealthy() || (elapsed_step_ >= max_episode_steps_); + done_ = (terminate_when_unhealthy_ ? !IsHealthy() : false) || + (elapsed_step_ >= max_episode_steps_); WriteObs(reward, xv, x_after); } @@ -161,7 +164,7 @@ class HopperEnv : public Env, public MujocoEnv { state["reward"_] = reward; // obs mjtNum* obs = static_cast(state["obs"_].data()); - for (int i = 1; i < model_->nq; ++i) { + for (int i = no_pos_ ? 1 : 0; i < model_->nq; ++i) { *(obs++) = data_->qpos[i]; } for (int i = 0; i < model_->nv; ++i) { @@ -173,8 +176,10 @@ class HopperEnv : public Env, public MujocoEnv { // info state["info:x_position"_] = x_after; state["info:x_velocity"_] = xv; - state["info:qpos0"_].Assign(qpos0_.get(), model_->nq); - state["info:qvel0"_].Assign(qvel0_.get(), model_->nv); +#ifdef ENVPOOL_TEST + state["info:qpos0"_].Assign(qpos0_, model_->nq); + state["info:qvel0"_].Assign(qvel0_, model_->nv); +#endif } }; diff --git a/envpool/mujoco/humanoid.h b/envpool/mujoco/humanoid.h index 249e4496..9c8c3d96 100644 --- a/envpool/mujoco/humanoid.h +++ b/envpool/mujoco/humanoid.h @@ -34,6 +34,8 @@ class HumanoidEnvFns { return MakeDict( "max_episode_steps"_.bind(1000), "frame_skip"_.bind(5), "post_constraint"_.bind(true), "forward_reward_weight"_.bind(1.25), + "terminate_when_unhealthy"_.bind(true), + "exclude_current_positions_from_observation"_.bind(true), "ctrl_cost_weight"_.bind(0.1), "contact_cost_weight"_.bind(5e-7), "contact_cost_max"_.bind(10.0), "healthy_reward"_.bind(5.0), "healthy_z_min"_.bind(1.0), "healthy_z_max"_.bind(2.0), @@ -42,19 +44,22 @@ class HumanoidEnvFns { template static decltype(auto) StateSpec(const Config& conf) { mjtNum inf = std::numeric_limits::infinity(); - return MakeDict("obs"_.bind(Spec({376}, {-inf, inf})), - "info:reward_linvel"_.bind(Spec({-1})), - "info:reward_quadctrl"_.bind(Spec({-1})), - "info:reward_alive"_.bind(Spec({-1})), - "info:reward_impact"_.bind(Spec({-1})), - "info:x_position"_.bind(Spec({-1})), - "info:y_position"_.bind(Spec({-1})), - "info:distance_from_origin"_.bind(Spec({-1})), - "info:x_velocity"_.bind(Spec({-1})), - "info:y_velocity"_.bind(Spec({-1})), - // TODO(jiayi): remove these two lines for speed - "info:qpos0"_.bind(Spec({24})), - "info:qvel0"_.bind(Spec({23}))); + bool no_pos = conf["exclude_current_positions_from_observation"_]; + return MakeDict( + "obs"_.bind(Spec({no_pos ? 376 : 378}, {-inf, inf})), +#ifdef ENVPOOL_TEST + "info:qpos0"_.bind(Spec({24})), + "info:qvel0"_.bind(Spec({23})), +#endif + "info:reward_linvel"_.bind(Spec({-1})), + "info:reward_quadctrl"_.bind(Spec({-1})), + "info:reward_alive"_.bind(Spec({-1})), + "info:reward_impact"_.bind(Spec({-1})), + "info:x_position"_.bind(Spec({-1})), + "info:y_position"_.bind(Spec({-1})), + "info:distance_from_origin"_.bind(Spec({-1})), + "info:x_velocity"_.bind(Spec({-1})), + "info:y_velocity"_.bind(Spec({-1}))); } template static decltype(auto) ActionSpec(const Config& conf) { @@ -66,22 +71,21 @@ typedef class EnvSpec HumanoidEnvSpec; class HumanoidEnv : public Env, public MujocoEnv { protected: - int max_episode_steps_, elapsed_step_; + bool terminate_when_unhealthy_, no_pos_; mjtNum ctrl_cost_weight_, contact_cost_weight_, contact_cost_max_; mjtNum forward_reward_weight_, healthy_reward_; mjtNum healthy_z_min_, healthy_z_max_; mjtNum mass_x_, mass_y_; - std::unique_ptr qpos0_, qvel0_; // for align check std::uniform_real_distribution<> dist_; - bool done_; public: HumanoidEnv(const Spec& spec, int env_id) : Env(spec, env_id), MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/humanoid.xml", - spec.config["frame_skip"_], spec.config["post_constraint"_]), - max_episode_steps_(spec.config["max_episode_steps"_]), - elapsed_step_(max_episode_steps_ + 1), + spec.config["frame_skip"_], spec.config["post_constraint"_], + spec.config["max_episode_steps"_]), + terminate_when_unhealthy_(spec.config["terminate_when_unhealthy"_]), + no_pos_(spec.config["exclude_current_positions_from_observation"_]), ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]), contact_cost_weight_(spec.config["contact_cost_weight"_]), contact_cost_max_(spec.config["contact_cost_max"_]), @@ -91,18 +95,15 @@ class HumanoidEnv : public Env, public MujocoEnv { healthy_z_max_(spec.config["healthy_z_max"_]), mass_x_(0), mass_y_(0), - qpos0_(new mjtNum[model_->nq]), - qvel0_(new mjtNum[model_->nv]), dist_(-spec.config["reset_noise_scale"_], - spec.config["reset_noise_scale"_]), - done_(true) {} + spec.config["reset_noise_scale"_]) {} void MujocoResetModel() { for (int i = 0; i < model_->nq; ++i) { - data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_(gen_); + data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_(gen_); } for (int i = 0; i < model_->nv; ++i) { - data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_(gen_); + data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_(gen_); } } @@ -112,7 +113,7 @@ class HumanoidEnv : public Env, public MujocoEnv { done_ = false; elapsed_step_ = 0; MujocoReset(); - WriteObs(0.0f, 0, 0, 0, 0, 0, 0); + WriteObs(0.0f, 0, 0, 0, 0, 0, 0, 0); } void Step(const Action& action) override { @@ -142,11 +143,15 @@ class HumanoidEnv : public Env, public MujocoEnv { contact_cost = std::min(contact_cost, contact_cost_max_); // reward and done - float reward = xv * forward_reward_weight_ + healthy_reward_ - ctrl_cost - - contact_cost; + mjtNum healthy_reward = + terminate_when_unhealthy_ || IsHealthy() ? healthy_reward_ : 0.0; + float reward = + xv * forward_reward_weight_ + healthy_reward - ctrl_cost - contact_cost; ++elapsed_step_; - done_ = !IsHealthy() || (elapsed_step_ >= max_episode_steps_); - WriteObs(reward, xv, yv, ctrl_cost, contact_cost, x_after, y_after); + done_ = (terminate_when_unhealthy_ ? !IsHealthy() : false) || + (elapsed_step_ >= max_episode_steps_); + WriteObs(reward, xv, yv, ctrl_cost, contact_cost, x_after, y_after, + healthy_reward); } private: @@ -168,12 +173,13 @@ class HumanoidEnv : public Env, public MujocoEnv { } void WriteObs(float reward, mjtNum xv, mjtNum yv, mjtNum ctrl_cost, - mjtNum contact_cost, mjtNum x_after, mjtNum y_after) { + mjtNum contact_cost, mjtNum x_after, mjtNum y_after, + mjtNum healthy_reward) { State state = Allocate(); state["reward"_] = reward; // obs mjtNum* obs = static_cast(state["obs"_].data()); - for (int i = 2; i < model_->nq; ++i) { + for (int i = no_pos_ ? 2 : 0; i < model_->nq; ++i) { *(obs++) = data_->qpos[i]; } for (int i = 0; i < model_->nv; ++i) { @@ -195,15 +201,17 @@ class HumanoidEnv : public Env, public MujocoEnv { state["info:reward_linvel"_] = xv * forward_reward_weight_; state["info:reward_quadctrl"_] = -ctrl_cost; state["info:reward_impact"_] = -contact_cost; - state["info:reward_alive"_] = healthy_reward_; + state["info:reward_alive"_] = healthy_reward; state["info:x_position"_] = x_after; state["info:y_position"_] = y_after; state["info:distance_from_origin"_] = std::sqrt(x_after * x_after + y_after * y_after); state["info:x_velocity"_] = xv; state["info:y_velocity"_] = yv; - state["info:qpos0"_].Assign(qpos0_.get(), model_->nq); - state["info:qvel0"_].Assign(qvel0_.get(), model_->nv); +#ifdef ENVPOOL_TEST + state["info:qpos0"_].Assign(qpos0_, model_->nq); + state["info:qvel0"_].Assign(qvel0_, model_->nv); +#endif } }; diff --git a/envpool/mujoco/humanoid_standup.h b/envpool/mujoco/humanoid_standup.h index e2a09945..d8dd314d 100644 --- a/envpool/mujoco/humanoid_standup.h +++ b/envpool/mujoco/humanoid_standup.h @@ -34,6 +34,7 @@ class HumanoidStandupEnvFns { return MakeDict( "max_episode_steps"_.bind(1000), "frame_skip"_.bind(5), "post_constraint"_.bind(true), "forward_reward_weight"_.bind(1.0), + "exclude_current_positions_from_observation"_.bind(true), "ctrl_cost_weight"_.bind(0.1), "contact_cost_weight"_.bind(5e-7), "contact_cost_max"_.bind(10.0), "healthy_reward"_.bind(1.0), "reset_noise_scale"_.bind(1e-2)); @@ -41,14 +42,17 @@ class HumanoidStandupEnvFns { template static decltype(auto) StateSpec(const Config& conf) { mjtNum inf = std::numeric_limits::infinity(); - return MakeDict("obs"_.bind(Spec({376}, {-inf, inf})), - "info:reward_linup"_.bind(Spec({-1})), - "info:reward_quadctrl"_.bind(Spec({-1})), - "info:reward_alive"_.bind(Spec({-1})), - "info:reward_impact"_.bind(Spec({-1})), - // TODO(jiayi): remove these two lines for speed - "info:qpos0"_.bind(Spec({24})), - "info:qvel0"_.bind(Spec({23}))); + bool no_pos = conf["exclude_current_positions_from_observation"_]; + return MakeDict( +#ifdef ENVPOOL_TEST + "info:qpos0"_.bind(Spec({24})), + "info:qvel0"_.bind(Spec({23})), +#endif + "obs"_.bind(Spec({no_pos ? 376 : 378}, {-inf, inf})), + "info:reward_linup"_.bind(Spec({-1})), + "info:reward_quadctrl"_.bind(Spec({-1})), + "info:reward_alive"_.bind(Spec({-1})), + "info:reward_impact"_.bind(Spec({-1}))); } template static decltype(auto) ActionSpec(const Config& conf) { @@ -61,38 +65,33 @@ typedef class EnvSpec HumanoidStandupEnvSpec; class HumanoidStandupEnv : public Env, public MujocoEnv { protected: - int max_episode_steps_, elapsed_step_; + bool no_pos_; mjtNum ctrl_cost_weight_, contact_cost_weight_, contact_cost_max_; mjtNum forward_reward_weight_, healthy_reward_; - std::unique_ptr qpos0_, qvel0_; // for align check std::uniform_real_distribution<> dist_; - bool done_; public: HumanoidStandupEnv(const Spec& spec, int env_id) : Env(spec, env_id), MujocoEnv( spec.config["base_path"_] + "/mujoco/assets/humanoidstandup.xml", - spec.config["frame_skip"_], spec.config["post_constraint"_]), - max_episode_steps_(spec.config["max_episode_steps"_]), - elapsed_step_(max_episode_steps_ + 1), + spec.config["frame_skip"_], spec.config["post_constraint"_], + spec.config["max_episode_steps"_]), + no_pos_(spec.config["exclude_current_positions_from_observation"_]), ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]), contact_cost_weight_(spec.config["contact_cost_weight"_]), contact_cost_max_(spec.config["contact_cost_max"_]), forward_reward_weight_(spec.config["forward_reward_weight"_]), healthy_reward_(spec.config["healthy_reward"_]), - qpos0_(new mjtNum[model_->nq]), - qvel0_(new mjtNum[model_->nv]), dist_(-spec.config["reset_noise_scale"_], - spec.config["reset_noise_scale"_]), - done_(true) {} + spec.config["reset_noise_scale"_]) {} void MujocoResetModel() { for (int i = 0; i < model_->nq; ++i) { - data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_(gen_); + data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_(gen_); } for (int i = 0; i < model_->nv; ++i) { - data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_(gen_); + data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_(gen_); } } @@ -139,7 +138,7 @@ class HumanoidStandupEnv : public Env, state["reward"_] = reward; // obs mjtNum* obs = static_cast(state["obs"_].data()); - for (int i = 2; i < model_->nq; ++i) { + for (int i = no_pos_ ? 2 : 0; i < model_->nq; ++i) { *(obs++) = data_->qpos[i]; } for (int i = 0; i < model_->nv; ++i) { @@ -162,8 +161,10 @@ class HumanoidStandupEnv : public Env, state["info:reward_quadctrl"_] = -ctrl_cost; state["info:reward_impact"_] = -contact_cost; state["info:reward_alive"_] = healthy_reward_; - state["info:qpos0"_].Assign(qpos0_.get(), model_->nq); - state["info:qvel0"_].Assign(qvel0_.get(), model_->nv); +#ifdef ENVPOOL_TEST + state["info:qpos0"_].Assign(qpos0_, model_->nq); + state["info:qvel0"_].Assign(qvel0_, model_->nv); +#endif } }; diff --git a/envpool/mujoco/inverted_double_pendulum.h b/envpool/mujoco/inverted_double_pendulum.h index c57c2195..eba930b9 100644 --- a/envpool/mujoco/inverted_double_pendulum.h +++ b/envpool/mujoco/inverted_double_pendulum.h @@ -41,10 +41,13 @@ class InvertedDoublePendulumEnvFns { template static decltype(auto) StateSpec(const Config& conf) { mjtNum inf = std::numeric_limits::infinity(); +#ifdef ENVPOOL_TEST return MakeDict("obs"_.bind(Spec({11}, {-inf, inf})), - // TODO(jiayi): remove these two lines for speed "info:qpos0"_.bind(Spec({3})), "info:qvel0"_.bind(Spec({3}))); +#else + return MakeDict("obs"_.bind(Spec({11}, {-inf, inf}))); +#endif } template static decltype(auto) ActionSpec(const Config& conf) { @@ -58,39 +61,32 @@ typedef class EnvSpec class InvertedDoublePendulumEnv : public Env, public MujocoEnv { protected: - int max_episode_steps_, elapsed_step_; mjtNum healthy_reward_, healthy_z_max_; mjtNum observation_min_, observation_max_; - std::unique_ptr qpos0_, qvel0_; // for align check std::uniform_real_distribution<> dist_qpos_; std::normal_distribution<> dist_qvel_; - bool done_; public: InvertedDoublePendulumEnv(const Spec& spec, int env_id) : Env(spec, env_id), MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/inverted_double_pendulum.xml", - spec.config["frame_skip"_], spec.config["post_constraint"_]), - max_episode_steps_(spec.config["max_episode_steps"_]), - elapsed_step_(max_episode_steps_ + 1), + spec.config["frame_skip"_], spec.config["post_constraint"_], + spec.config["max_episode_steps"_]), healthy_reward_(spec.config["healthy_reward"_]), healthy_z_max_(spec.config["healthy_z_max"_]), observation_min_(spec.config["observation_min"_]), observation_max_(spec.config["observation_max"_]), - qpos0_(new mjtNum[model_->nq]), - qvel0_(new mjtNum[model_->nv]), dist_qpos_(-spec.config["reset_noise_scale"_], spec.config["reset_noise_scale"_]), - dist_qvel_(0, spec.config["reset_noise_scale"_]), - done_(true) {} + dist_qvel_(0, spec.config["reset_noise_scale"_]) {} void MujocoResetModel() { for (int i = 0; i < model_->nq; ++i) { - data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_qpos_(gen_); + data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_qpos_(gen_); } for (int i = 0; i < model_->nv; ++i) { - data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_qvel_(gen_); + data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_qvel_(gen_); } } @@ -148,8 +144,10 @@ class InvertedDoublePendulumEnv : public Env, *(obs++) = x; } // info - state["info:qpos0"_].Assign(qpos0_.get(), model_->nq); - state["info:qvel0"_].Assign(qvel0_.get(), model_->nv); +#ifdef ENVPOOL_TEST + state["info:qpos0"_].Assign(qpos0_, model_->nq); + state["info:qvel0"_].Assign(qvel0_, model_->nv); +#endif } }; diff --git a/envpool/mujoco/inverted_pendulum.h b/envpool/mujoco/inverted_pendulum.h index aba0896b..cfad7a46 100644 --- a/envpool/mujoco/inverted_pendulum.h +++ b/envpool/mujoco/inverted_pendulum.h @@ -40,10 +40,13 @@ class InvertedPendulumEnvFns { template static decltype(auto) StateSpec(const Config& conf) { mjtNum inf = std::numeric_limits::infinity(); +#ifdef ENVPOOL_TEST return MakeDict("obs"_.bind(Spec({4}, {-inf, inf})), - // TODO(jiayi): remove these two lines for speed "info:qpos0"_.bind(Spec({2})), "info:qvel0"_.bind(Spec({2}))); +#else + return MakeDict("obs"_.bind(Spec({4}, {-inf, inf}))); +#endif } template static decltype(auto) ActionSpec(const Config& conf) { @@ -56,35 +59,28 @@ typedef class EnvSpec InvertedPendulumEnvSpec; class InvertedPendulumEnv : public Env, public MujocoEnv { protected: - int max_episode_steps_, elapsed_step_; mjtNum healthy_reward_, healthy_z_min_, healthy_z_max_; - std::unique_ptr qpos0_, qvel0_; // for align check std::uniform_real_distribution<> dist_; - bool done_; public: InvertedPendulumEnv(const Spec& spec, int env_id) : Env(spec, env_id), MujocoEnv( spec.config["base_path"_] + "/mujoco/assets/inverted_pendulum.xml", - spec.config["frame_skip"_], spec.config["post_constraint"_]), - max_episode_steps_(spec.config["max_episode_steps"_]), - elapsed_step_(max_episode_steps_ + 1), + spec.config["frame_skip"_], spec.config["post_constraint"_], + spec.config["max_episode_steps"_]), healthy_reward_(spec.config["healthy_reward"_]), healthy_z_min_(spec.config["healthy_z_min"_]), healthy_z_max_(spec.config["healthy_z_max"_]), - qpos0_(new mjtNum[model_->nq]), - qvel0_(new mjtNum[model_->nv]), dist_(-spec.config["reset_noise_scale"_], - spec.config["reset_noise_scale"_]), - done_(true) {} + spec.config["reset_noise_scale"_]) {} void MujocoResetModel() { for (int i = 0; i < model_->nq; ++i) { - data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_(gen_); + data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_(gen_); } for (int i = 0; i < model_->nv; ++i) { - data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_(gen_); + data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_(gen_); } } @@ -137,8 +133,10 @@ class InvertedPendulumEnv : public Env, *(obs++) = data_->qvel[i]; } // info - state["info:qpos0"_].Assign(qpos0_.get(), model_->nq); - state["info:qvel0"_].Assign(qvel0_.get(), model_->nv); +#ifdef ENVPOOL_TEST + state["info:qpos0"_].Assign(qpos0_, model_->nq); + state["info:qvel0"_].Assign(qvel0_, model_->nv); +#endif } }; diff --git a/envpool/mujoco/mujoco_env.h b/envpool/mujoco/mujoco_env.h index 71991c97..5e5c3a17 100644 --- a/envpool/mujoco/mujoco_env.h +++ b/envpool/mujoco/mujoco_env.h @@ -30,17 +30,26 @@ class MujocoEnv { mjModel* model_; mjData* data_; mjtNum *init_qpos_, *init_qvel_; + mjtNum *qpos0_, *qvel0_; // for align check int frame_skip_; bool post_constraint_; + int max_episode_steps_, elapsed_step_; + bool done_; public: - MujocoEnv(std::string xml, int frame_skip, bool post_constraint) + MujocoEnv(std::string xml, int frame_skip, bool post_constraint, + int max_episode_steps) : model_(mj_loadXML(xml.c_str(), 0, error_, 1000)), data_(mj_makeData(model_)), init_qpos_(new mjtNum[model_->nq]), init_qvel_(new mjtNum[model_->nv]), + qpos0_(new mjtNum[model_->nq]), + qvel0_(new mjtNum[model_->nv]), frame_skip_(frame_skip), - post_constraint_(post_constraint) { + post_constraint_(post_constraint), + max_episode_steps_(max_episode_steps), + elapsed_step_(max_episode_steps + 1), + done_(true) { memcpy(init_qpos_, data_->qpos, sizeof(mjtNum) * model_->nq); memcpy(init_qvel_, data_->qvel, sizeof(mjtNum) * model_->nv); } @@ -50,6 +59,8 @@ class MujocoEnv { mj_deleteModel(model_); delete[] init_qpos_; delete[] init_qvel_; + delete[] qpos0_; + delete[] qvel0_; } void MujocoReset() { diff --git a/envpool/mujoco/mujoco_test.py b/envpool/mujoco/mujoco_test.py index a59586b6..6f3a1851 100644 --- a/envpool/mujoco/mujoco_test.py +++ b/envpool/mujoco/mujoco_test.py @@ -130,6 +130,21 @@ def test_ant(self) -> None: env1 = AntGymEnvPool(AntEnvSpec(AntEnvSpec.gen_config())) self.run_space_check(env0, env1) self.run_align_check(env0, env1) + env0 = mjc_mwe.AntEnv( + terminate_when_unhealthy=False, + exclude_current_positions_from_observation=False, + ) + env1 = AntGymEnvPool( + AntEnvSpec( + AntEnvSpec.gen_config( + terminate_when_unhealthy=False, + exclude_current_positions_from_observation=False, + max_episode_steps=100, + ) + ) + ) + self.run_space_check(env0, env1) + self.run_align_check(env0, env1, no_time_limit=True) self.run_deterministic_check(AntEnvSpec, AntGymEnvPool) def test_half_cheetah(self) -> None: @@ -139,6 +154,17 @@ def test_half_cheetah(self) -> None: ) self.run_space_check(env0, env1) self.run_align_check(env0, env1, no_time_limit=True) + env0 = mjc_mwe.HalfCheetahEnv( + exclude_current_positions_from_observation=True + ) + env1 = HalfCheetahGymEnvPool( + HalfCheetahEnvSpec( + HalfCheetahEnvSpec.gen_config( + exclude_current_positions_from_observation=True + ) + ) + ) + self.run_space_check(env0, env1) self.run_deterministic_check(HalfCheetahEnvSpec, HalfCheetahGymEnvPool) def test_hopper(self) -> None: @@ -146,6 +172,20 @@ def test_hopper(self) -> None: env1 = HopperGymEnvPool(HopperEnvSpec(HopperEnvSpec.gen_config())) self.run_space_check(env0, env1) self.run_align_check(env0, env1) + env0 = mjc_mwe.HopperEnv( + terminate_when_unhealthy=False, + exclude_current_positions_from_observation=False, + ) + env1 = HopperGymEnvPool( + HopperEnvSpec( + HopperEnvSpec.gen_config( + terminate_when_unhealthy=False, + exclude_current_positions_from_observation=False, + ) + ) + ) + self.run_space_check(env0, env1) + self.run_align_check(env0, env1, no_time_limit=True) self.run_deterministic_check(HopperEnvSpec, HopperGymEnvPool) def test_humanoid(self) -> None: @@ -153,6 +193,20 @@ def test_humanoid(self) -> None: env1 = HumanoidGymEnvPool(HumanoidEnvSpec(HumanoidEnvSpec.gen_config())) self.run_space_check(env0, env1) self.run_align_check(env0, env1) + env0 = mjc_mwe.HumanoidEnv( + terminate_when_unhealthy=False, + exclude_current_positions_from_observation=False, + ) + env1 = HumanoidGymEnvPool( + HumanoidEnvSpec( + HumanoidEnvSpec.gen_config( + terminate_when_unhealthy=False, + exclude_current_positions_from_observation=False, + ) + ) + ) + self.run_space_check(env0, env1) + self.run_align_check(env0, env1, no_time_limit=True) self.run_deterministic_check(HumanoidEnvSpec, HumanoidGymEnvPool) def test_humanoid_standup(self) -> None: @@ -209,12 +263,36 @@ def test_swimmer(self) -> None: env1 = SwimmerGymEnvPool(SwimmerEnvSpec(SwimmerEnvSpec.gen_config())) self.run_space_check(env0, env1) self.run_align_check(env0, env1, no_time_limit=True) + env0 = mjc_mwe.SwimmerEnv(exclude_current_positions_from_observation=False) + env1 = SwimmerGymEnvPool( + SwimmerEnvSpec( + SwimmerEnvSpec.gen_config( + exclude_current_positions_from_observation=False + ) + ) + ) + self.run_space_check(env0, env1) self.run_deterministic_check(SwimmerEnvSpec, SwimmerGymEnvPool) def test_walker2d(self) -> None: env0 = mjc_mwe.Walker2dEnv() env1 = Walker2dGymEnvPool(Walker2dEnvSpec(Walker2dEnvSpec.gen_config())) self.run_space_check(env0, env1) + self.run_align_check(env0, env1) + env0 = mjc_mwe.Walker2dEnv( + terminate_when_unhealthy=False, + exclude_current_positions_from_observation=False, + ) + env1 = Walker2dGymEnvPool( + Walker2dEnvSpec( + Walker2dEnvSpec.gen_config( + terminate_when_unhealthy=False, + exclude_current_positions_from_observation=False, + max_episode_steps=100, + ) + ) + ) + self.run_space_check(env0, env1) self.run_align_check(env0, env1, no_time_limit=True) self.run_deterministic_check(Walker2dEnvSpec, Walker2dGymEnvPool) diff --git a/envpool/mujoco/pusher.h b/envpool/mujoco/pusher.h index c121848d..97fab1b8 100644 --- a/envpool/mujoco/pusher.h +++ b/envpool/mujoco/pusher.h @@ -44,11 +44,12 @@ class PusherEnvFns { static decltype(auto) StateSpec(const Config& conf) { mjtNum inf = std::numeric_limits::infinity(); return MakeDict("obs"_.bind(Spec({23}, {-inf, inf})), - "info:reward_dist"_.bind(Spec({-1})), - "info:reward_ctrl"_.bind(Spec({-1})), - // TODO(jiayi): remove these two lines for speed +#ifdef ENVPOOL_TEST "info:qpos0"_.bind(Spec({11})), - "info:qvel0"_.bind(Spec({11}))); + "info:qvel0"_.bind(Spec({11})), +#endif + "info:reward_dist"_.bind(Spec({-1})), + "info:reward_ctrl"_.bind(Spec({-1}))); } template static decltype(auto) ActionSpec(const Config& conf) { @@ -60,50 +61,43 @@ typedef class EnvSpec PusherEnvSpec; class PusherEnv : public Env, public MujocoEnv { protected: - int max_episode_steps_, elapsed_step_; mjtNum ctrl_cost_weight_, dist_cost_weight_, near_cost_weight_; mjtNum cylinder_dist_min_; - std::unique_ptr qpos0_, qvel0_; // for align check std::uniform_real_distribution<> dist_qpos_x_, dist_qpos_y_, dist_qvel_; - bool done_; public: PusherEnv(const Spec& spec, int env_id) : Env(spec, env_id), MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/pusher.xml", - spec.config["frame_skip"_], spec.config["post_constraint"_]), - max_episode_steps_(spec.config["max_episode_steps"_]), - elapsed_step_(max_episode_steps_ + 1), + spec.config["frame_skip"_], spec.config["post_constraint"_], + spec.config["max_episode_steps"_]), ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]), dist_cost_weight_(spec.config["dist_cost_weight"_]), near_cost_weight_(spec.config["near_cost_weight"_]), cylinder_dist_min_(spec.config["cylinder_dist_min"_]), - qpos0_(new mjtNum[model_->nq]), - qvel0_(new mjtNum[model_->nv]), dist_qpos_x_(spec.config["cylinder_x_min"_], spec.config["cylinder_x_max"_]), dist_qpos_y_(spec.config["cylinder_y_min"_], spec.config["cylinder_y_max"_]), dist_qvel_(-spec.config["reset_qvel_scale"_], - spec.config["reset_qvel_scale"_]), - done_(true) {} + spec.config["reset_qvel_scale"_]) {} void MujocoResetModel() { for (int i = 0; i < model_->nq - 4; ++i) { - data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i]; + data_->qpos[i] = qpos0_[i] = init_qpos_[i]; } while (1) { mjtNum x = dist_qpos_x_(gen_), y = dist_qpos_y_(gen_); if (std::sqrt(x * x + y * y) > cylinder_dist_min_) { - data_->qpos[model_->nq - 4] = qpos0_.get()[model_->nq - 4] = x; - data_->qpos[model_->nq - 3] = qpos0_.get()[model_->nq - 3] = y; - data_->qpos[model_->nq - 2] = qpos0_.get()[model_->nq - 2] = 0.0; - data_->qpos[model_->nq - 1] = qpos0_.get()[model_->nq - 1] = 0.0; + data_->qpos[model_->nq - 4] = qpos0_[model_->nq - 4] = x; + data_->qpos[model_->nq - 3] = qpos0_[model_->nq - 3] = y; + data_->qpos[model_->nq - 2] = qpos0_[model_->nq - 2] = 0.0; + data_->qpos[model_->nq - 1] = qpos0_[model_->nq - 1] = 0.0; break; } } for (int i = 0; i < model_->nv; ++i) { - data_->qvel[i] = qvel0_.get()[i] = + data_->qvel[i] = qvel0_[i] = i < model_->nv - 4 ? init_qvel_[i] + dist_qvel_(gen_) : 0.0; } } @@ -163,8 +157,10 @@ class PusherEnv : public Env, public MujocoEnv { // info state["info:reward_dist"_] = -dist_cost; state["info:reward_ctrl"_] = -ctrl_cost; - state["info:qpos0"_].Assign(qpos0_.get(), model_->nq); - state["info:qvel0"_].Assign(qvel0_.get(), model_->nv); +#ifdef ENVPOOL_TEST + state["info:qpos0"_].Assign(qpos0_, model_->nq); + state["info:qvel0"_].Assign(qvel0_, model_->nv); +#endif } }; diff --git a/envpool/mujoco/reacher.h b/envpool/mujoco/reacher.h index 2b236800..567eb4c1 100644 --- a/envpool/mujoco/reacher.h +++ b/envpool/mujoco/reacher.h @@ -42,11 +42,12 @@ class ReacherEnvFns { static decltype(auto) StateSpec(const Config& conf) { mjtNum inf = std::numeric_limits::infinity(); return MakeDict("obs"_.bind(Spec({11}, {-inf, inf})), - "info:reward_dist"_.bind(Spec({-1})), - "info:reward_ctrl"_.bind(Spec({-1})), - // TODO(jiayi): remove these two lines for speed +#ifdef ENVPOOL_TEST "info:qpos0"_.bind(Spec({4})), - "info:qvel0"_.bind(Spec({4}))); + "info:qvel0"_.bind(Spec({4})), +#endif + "info:reward_dist"_.bind(Spec({-1})), + "info:reward_ctrl"_.bind(Spec({-1}))); } template static decltype(auto) ActionSpec(const Config& conf) { @@ -58,47 +59,40 @@ typedef class EnvSpec ReacherEnvSpec; class ReacherEnv : public Env, public MujocoEnv { protected: - int max_episode_steps_, elapsed_step_; mjtNum ctrl_cost_weight_, dist_cost_weight_; mjtNum reset_goal_scale_, dist_x_, dist_y_, dist_z_; - std::unique_ptr qpos0_, qvel0_; // for align check std::uniform_real_distribution<> dist_qpos_, dist_qvel_, dist_goal_; - bool done_; public: ReacherEnv(const Spec& spec, int env_id) : Env(spec, env_id), MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/reacher.xml", - spec.config["frame_skip"_], spec.config["post_constraint"_]), - max_episode_steps_(spec.config["max_episode_steps"_]), - elapsed_step_(max_episode_steps_ + 1), + spec.config["frame_skip"_], spec.config["post_constraint"_], + spec.config["max_episode_steps"_]), ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]), dist_cost_weight_(spec.config["dist_cost_weight"_]), reset_goal_scale_(spec.config["reset_goal_scale"_]), - qpos0_(new mjtNum[model_->nq]), - qvel0_(new mjtNum[model_->nv]), dist_qpos_(-spec.config["reset_qpos_scale"_], spec.config["reset_qpos_scale"_]), dist_qvel_(-spec.config["reset_qvel_scale"_], spec.config["reset_qvel_scale"_]), dist_goal_(-spec.config["reset_goal_scale"_], - spec.config["reset_goal_scale"_]), - done_(true) {} + spec.config["reset_goal_scale"_]) {} void MujocoResetModel() { for (int i = 0; i < model_->nq - 2; ++i) { - data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_qpos_(gen_); + data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_qpos_(gen_); } while (1) { mjtNum x = dist_goal_(gen_), y = dist_goal_(gen_); if (std::sqrt(x * x + y * y) < reset_goal_scale_) { - data_->qpos[model_->nq - 2] = qpos0_.get()[model_->nq - 2] = x; - data_->qpos[model_->nq - 1] = qpos0_.get()[model_->nq - 1] = y; + data_->qpos[model_->nq - 2] = qpos0_[model_->nq - 2] = x; + data_->qpos[model_->nq - 1] = qpos0_[model_->nq - 1] = y; break; } } for (int i = 0; i < model_->nv; ++i) { - data_->qvel[i] = qvel0_.get()[i] = + data_->qvel[i] = qvel0_[i] = i < model_->nv - 2 ? init_qvel_[i] + dist_qvel_(gen_) : 0.0; } } @@ -163,8 +157,10 @@ class ReacherEnv : public Env, public MujocoEnv { // info state["info:reward_dist"_] = -dist_cost; state["info:reward_ctrl"_] = -ctrl_cost; - state["info:qpos0"_].Assign(qpos0_.get(), model_->nq); - state["info:qvel0"_].Assign(qvel0_.get(), model_->nv); +#ifdef ENVPOOL_TEST + state["info:qpos0"_].Assign(qpos0_, model_->nq); + state["info:qvel0"_].Assign(qvel0_, model_->nv); +#endif } }; diff --git a/envpool/mujoco/registration.py b/envpool/mujoco/registration.py index cb02422c..d56f478a 100644 --- a/envpool/mujoco/registration.py +++ b/envpool/mujoco/registration.py @@ -51,7 +51,6 @@ spec_cls=f"{task}EnvSpec", dm_cls=f"{task}DMEnvPool", gym_cls=f"{task}GymEnvPool", - max_episode_steps=1000, base_path=base_path, post_constraint=post_constraint, ) diff --git a/envpool/mujoco/swimmer.h b/envpool/mujoco/swimmer.h index 991ac618..b3d4a70e 100644 --- a/envpool/mujoco/swimmer.h +++ b/envpool/mujoco/swimmer.h @@ -34,23 +34,26 @@ class SwimmerEnvFns { return MakeDict( "max_episode_steps"_.bind(1000), "reward_threshold"_.bind(360.0), "frame_skip"_.bind(4), "post_constraint"_.bind(true), + "exclude_current_positions_from_observation"_.bind(true), "forward_reward_weight"_.bind(1.0), "ctrl_cost_weight"_.bind(1e-4), "reset_noise_scale"_.bind(0.1)); } template static decltype(auto) StateSpec(const Config& conf) { mjtNum inf = std::numeric_limits::infinity(); - return MakeDict("obs"_.bind(Spec({8}, {-inf, inf})), + bool no_pos = conf["exclude_current_positions_from_observation"_]; + return MakeDict("obs"_.bind(Spec({no_pos ? 8 : 10}, {-inf, inf})), +#ifdef ENVPOOL_TEST + "info:qpos0"_.bind(Spec({5})), + "info:qvel0"_.bind(Spec({5})), +#endif "info:reward_fwd"_.bind(Spec({-1})), "info:reward_ctrl"_.bind(Spec({-1})), "info:x_position"_.bind(Spec({-1})), "info:y_position"_.bind(Spec({-1})), "info:distance_from_origin"_.bind(Spec({-1})), "info:x_velocity"_.bind(Spec({-1})), - "info:y_velocity"_.bind(Spec({-1})), - // TODO(jiayi): remove these two lines for speed - "info:qpos0"_.bind(Spec({5})), - "info:qvel0"_.bind(Spec({5}))); + "info:y_velocity"_.bind(Spec({-1}))); } template static decltype(auto) ActionSpec(const Config& conf) { @@ -62,33 +65,28 @@ typedef class EnvSpec SwimmerEnvSpec; class SwimmerEnv : public Env, public MujocoEnv { protected: - int max_episode_steps_, elapsed_step_; + bool no_pos_; mjtNum ctrl_cost_weight_, forward_reward_weight_; - std::unique_ptr qpos0_, qvel0_; // for align check std::uniform_real_distribution<> dist_; - bool done_; public: SwimmerEnv(const Spec& spec, int env_id) : Env(spec, env_id), MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/swimmer.xml", - spec.config["frame_skip"_], spec.config["post_constraint"_]), - max_episode_steps_(spec.config["max_episode_steps"_]), - elapsed_step_(max_episode_steps_ + 1), + spec.config["frame_skip"_], spec.config["post_constraint"_], + spec.config["max_episode_steps"_]), + no_pos_(spec.config["exclude_current_positions_from_observation"_]), ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]), forward_reward_weight_(spec.config["forward_reward_weight"_]), - qpos0_(new mjtNum[model_->nq]), - qvel0_(new mjtNum[model_->nv]), dist_(-spec.config["reset_noise_scale"_], - spec.config["reset_noise_scale"_]), - done_(true) {} + spec.config["reset_noise_scale"_]) {} void MujocoResetModel() { for (int i = 0; i < model_->nq; ++i) { - data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_(gen_); + data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_(gen_); } for (int i = 0; i < model_->nv; ++i) { - data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_(gen_); + data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_(gen_); } } @@ -131,7 +129,7 @@ class SwimmerEnv : public Env, public MujocoEnv { state["reward"_] = reward; // obs mjtNum* obs = static_cast(state["obs"_].data()); - for (int i = 2; i < model_->nq; ++i) { + for (int i = no_pos_ ? 2 : 0; i < model_->nq; ++i) { *(obs++) = data_->qpos[i]; } for (int i = 0; i < model_->nv; ++i) { @@ -146,8 +144,10 @@ class SwimmerEnv : public Env, public MujocoEnv { std::sqrt(x_after * x_after + y_after * y_after); state["info:x_velocity"_] = xv; state["info:y_velocity"_] = yv; - state["info:qpos0"_].Assign(qpos0_.get(), model_->nq); - state["info:qvel0"_].Assign(qvel0_.get(), model_->nv); +#ifdef ENVPOOL_TEST + state["info:qpos0"_].Assign(qpos0_, model_->nq); + state["info:qvel0"_].Assign(qvel0_, model_->nv); +#endif } }; diff --git a/envpool/mujoco/walker2d.h b/envpool/mujoco/walker2d.h index 5c486787..147aa266 100644 --- a/envpool/mujoco/walker2d.h +++ b/envpool/mujoco/walker2d.h @@ -34,6 +34,8 @@ class Walker2dEnvFns { return MakeDict( "max_episode_steps"_.bind(1000), "frame_skip"_.bind(4), "post_constraint"_.bind(true), "ctrl_cost_weight"_.bind(0.001), + "terminate_when_unhealthy"_.bind(true), + "exclude_current_positions_from_observation"_.bind(true), "forward_reward_weight"_.bind(1.0), "healthy_reward"_.bind(1.0), "healthy_z_min"_.bind(0.8), "healthy_z_max"_.bind(2.0), "healthy_angle_min"_.bind(-1.0), "healthy_angle_max"_.bind(1.0), @@ -43,12 +45,14 @@ class Walker2dEnvFns { template static decltype(auto) StateSpec(const Config& conf) { mjtNum inf = std::numeric_limits::infinity(); - return MakeDict("obs"_.bind(Spec({17}, {-inf, inf})), - "info:x_position"_.bind(Spec({-1})), - "info:x_velocity"_.bind(Spec({-1})), - // TODO(jiayi): remove these two lines for speed + bool no_pos = conf["exclude_current_positions_from_observation"_]; + return MakeDict("obs"_.bind(Spec({no_pos ? 17 : 18}, {-inf, inf})), +#ifdef ENVPOOL_TEST "info:qpos0"_.bind(Spec({9})), - "info:qvel0"_.bind(Spec({9}))); + "info:qvel0"_.bind(Spec({9})), +#endif + "info:x_position"_.bind(Spec({-1})), + "info:x_velocity"_.bind(Spec({-1}))); } template static decltype(auto) ActionSpec(const Config& conf) { @@ -60,22 +64,21 @@ typedef class EnvSpec Walker2dEnvSpec; class Walker2dEnv : public Env, public MujocoEnv { protected: - int max_episode_steps_, elapsed_step_; + bool terminate_when_unhealthy_, no_pos_; mjtNum ctrl_cost_weight_, forward_reward_weight_; mjtNum healthy_reward_, healthy_z_min_, healthy_z_max_; mjtNum healthy_angle_min_, healthy_angle_max_; mjtNum velocity_min_, velocity_max_; - std::unique_ptr qpos0_, qvel0_; // for align check std::uniform_real_distribution<> dist_; - bool done_; public: Walker2dEnv(const Spec& spec, int env_id) : Env(spec, env_id), MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/walker2d.xml", - spec.config["frame_skip"_], spec.config["post_constraint"_]), - max_episode_steps_(spec.config["max_episode_steps"_]), - elapsed_step_(max_episode_steps_ + 1), + spec.config["frame_skip"_], spec.config["post_constraint"_], + spec.config["max_episode_steps"_]), + terminate_when_unhealthy_(spec.config["terminate_when_unhealthy"_]), + no_pos_(spec.config["exclude_current_positions_from_observation"_]), ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]), forward_reward_weight_(spec.config["forward_reward_weight"_]), healthy_reward_(spec.config["healthy_reward"_]), @@ -85,18 +88,15 @@ class Walker2dEnv : public Env, public MujocoEnv { healthy_angle_max_(spec.config["healthy_angle_max"_]), velocity_min_(spec.config["velocity_min"_]), velocity_max_(spec.config["velocity_max"_]), - qpos0_(new mjtNum[model_->nq]), - qvel0_(new mjtNum[model_->nv]), dist_(-spec.config["reset_noise_scale"_], - spec.config["reset_noise_scale"_]), - done_(true) {} + spec.config["reset_noise_scale"_]) {} void MujocoResetModel() { for (int i = 0; i < model_->nq; ++i) { - data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_(gen_); + data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_(gen_); } for (int i = 0; i < model_->nv; ++i) { - data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_(gen_); + data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_(gen_); } } @@ -125,9 +125,12 @@ class Walker2dEnv : public Env, public MujocoEnv { mjtNum dt = frame_skip_ * model_->opt.timestep; mjtNum xv = (x_after - x_before) / dt; // reward and done - float reward = xv * forward_reward_weight_ + healthy_reward_ - ctrl_cost; + mjtNum healthy_reward = + terminate_when_unhealthy_ || IsHealthy() ? healthy_reward_ : 0.0; + float reward = xv * forward_reward_weight_ + healthy_reward - ctrl_cost; ++elapsed_step_; - done_ = !IsHealthy() || (elapsed_step_ >= max_episode_steps_); + done_ = (terminate_when_unhealthy_ ? !IsHealthy() : false) || + (elapsed_step_ >= max_episode_steps_); WriteObs(reward, xv, x_after); } @@ -148,7 +151,7 @@ class Walker2dEnv : public Env, public MujocoEnv { state["reward"_] = reward; // obs mjtNum* obs = static_cast(state["obs"_].data()); - for (int i = 1; i < model_->nq; ++i) { + for (int i = no_pos_ ? 1 : 0; i < model_->nq; ++i) { *(obs++) = data_->qpos[i]; } for (int i = 0; i < model_->nv; ++i) { @@ -160,8 +163,10 @@ class Walker2dEnv : public Env, public MujocoEnv { // info state["info:x_position"_] = x_after; state["info:x_velocity"_] = xv; - state["info:qpos0"_].Assign(qpos0_.get(), model_->nq); - state["info:qvel0"_].Assign(qvel0_.get(), model_->nv); +#ifdef ENVPOOL_TEST + state["info:qpos0"_].Assign(qpos0_, model_->nq); + state["info:qvel0"_].Assign(qvel0_, model_->nv); +#endif } }; diff --git a/examples/make_env.py b/examples/make_env.py index caf1a13e..660621e2 100644 --- a/examples/make_env.py +++ b/examples/make_env.py @@ -58,6 +58,13 @@ def make_spec() -> None: assert dm_act_spec.num_values, 6 +def check_info_optim() -> None: + env = envpool.make_gym("Ant-v3") + info = env.step(np.array([env.action_space.sample()]), np.array([0]))[-1] + assert "qpos0" not in info and "qvel0" not in info + + if __name__ == "__main__": make_env() make_spec() + check_info_optim()