Skip to content

Commit

Permalink
Align with gym mujoco options (#93)
Browse files Browse the repository at this point in the history
1. simplify the code and fix a bug in the handling of `max_episode_steps`
2. add `terminate_when_unhealthy` and `exclude_current_positions_from_observation` options to most of the MuJoCo envs
3. add a testing macro, `ENVPOOL_TEST`
4. omit `info["qpos0"]` and `info["qvel0"]` when building the release wheel
  • Loading branch information
Trinkle23897 authored Apr 29, 2022
1 parent 79d13c2 commit 8a269d8
Show file tree
Hide file tree
Showing 22 changed files with 388 additions and 264 deletions.
3 changes: 2 additions & 1 deletion .bazelrc
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ build --action_env=BAZEL_LINKLIBS=-l%:libstdc++.a:-lm
build --action_env=BAZEL_LINKOPTS=-static-libgcc
build --incompatible_strict_action_env --cxxopt=-std=c++17 --host_cxxopt=-std=c++17 --client_env=BAZEL_CXXOPTS=-std=c++17
build:release --copt=-g0 --copt=-O3 --copt=-DNDEBUG --copt=-msse --copt=-msse2 --copt=-mmmx
build:debug --compilation_mode=dbg -s
build:debug --cxxopt=-DENVPOOL_TEST --compilation_mode=dbg -s
build:test --cxxopt=-DENVPOOL_TEST

build:clang-tidy --aspects @bazel_clang_tidy//clang_tidy:clang_tidy.bzl%clang_tidy_aspect
build:clang-tidy --@bazel_clang_tidy//:clang_tidy_config=//:clang_tidy_config
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ bazel-build: bazel-install
cp bazel-bin/setup.runfiles/$(PROJECT_NAME)/dist/*.whl ./dist

bazel-test: bazel-install
bazel test --test_output=all $(BAZELOPT) //... --config=release
bazel test --test_output=all $(BAZELOPT) //... --config=release --config=test

bazel-clean: bazel-install
bazel clean --expunge
Expand Down Expand Up @@ -154,7 +154,7 @@ release-test1:
cd envpool && python3 make_test.py

release-test2:
cd examples && python3 env_step.py
cd examples && python3 make_env.py && python3 env_step.py

release-test: release-test1 release-test2

2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def get_version() -> str:
# -- Project information -----------------------------------------------------

project = "EnvPool"
copyright = "2021, Garena Online Private Limited"
copyright = "2022, Garena Online Private Limited"
author = "EnvPool Contributors"

# The full version, including alpha/beta/rc tags
Expand Down
3 changes: 3 additions & 0 deletions docs/pages/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ integration; however, you don't want to build other stuff such as OpenCV:
.. code-block:: bash
bazel test --test_output=all //envpool/mujoco:mujoco_test --config=release --config=test
# or alternatively
cd bazel-bin/envpool/mujoco/mujoco_test.runfiles/envpool/
./envpool/mujoco/mujoco_test
Feel free to customize the command in ``Makefile``!

Expand Down
11 changes: 11 additions & 0 deletions docs/pages/env.rst
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,17 @@ Miscellaneous
has already been defined as ``gen_`` (`link
<https://github.com/sail-sg/envpool/blob/v0.4.0/envpool/core/env.h#L37>`_).
.. note ::
``ENVPOOL_TEST`` is a test-time macro. If you want a piece of C++ code to be
compiled only during unit tests, wrap it in an ``#ifdef`` guard:
.. code-block:: c++
#ifdef ENVPOOL_TEST
fprintf(stderr, "here");
#endif
Generate Dynamic Linked .so File and Instantiate in Python
----------------------------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions docs/pages/interface.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Interface
=========
Python Interface
================

envpool.make
------------
Expand Down
10 changes: 5 additions & 5 deletions envpool/core/env.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class Env {

private:
StateBufferQueue* sbq_;
int order_, elapsed_step_;
int order_, current_step_;
bool is_single_player_;
StateBuffer::WritableSlice slice_;
// for parsing single env action from input action batch
Expand All @@ -60,7 +60,7 @@ class Env {
env_id_(env_id),
seed_(spec.config["seed"_] + env_id),
gen_(seed_),
elapsed_step_(-1),
current_step_(-1),
is_single_player_(max_num_players_ == 1),
action_specs_(spec.action_spec.template values<ShapeSpec>()),
is_player_action_(Transform(action_specs_, [](const ShapeSpec& s) {
Expand Down Expand Up @@ -147,9 +147,9 @@ class Env {
sbq_ = sbq;
order_ = order;
if (reset) {
elapsed_step_ = 0;
current_step_ = 0;
} else {
++elapsed_step_;
++current_step_;
}
}

Expand All @@ -163,7 +163,7 @@ class Env {
State state(&slice_.arr);
state["done"_] = IsDone();
state["info:env_id"_] = env_id_;
state["elapsed_step"_] = elapsed_step_;
state["elapsed_step"_] = current_step_;
int* player_env_id(static_cast<int*>(state["info:players.env_id"_].data()));
for (int i = 0; i < player_num; ++i) {
player_env_id[i] = env_id_;
Expand Down
79 changes: 43 additions & 36 deletions envpool/mujoco/ant.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ class AntEnvFns {
return MakeDict(
"max_episode_steps"_.bind(1000), "reward_threshold"_.bind(6000.0),
"frame_skip"_.bind(5), "post_constraint"_.bind(true),
"terminate_when_unhealthy"_.bind(true),
"exclude_current_positions_from_observation"_.bind(true),
"forward_reward_weight"_.bind(1.0), "ctrl_cost_weight"_.bind(0.5),
"contact_cost_weight"_.bind(5e-4), "healthy_reward"_.bind(1.0),
"healthy_z_min"_.bind(0.2), "healthy_z_max"_.bind(1.0),
Expand All @@ -43,19 +45,22 @@ class AntEnvFns {
template <typename Config>
static decltype(auto) StateSpec(const Config& conf) {
mjtNum inf = std::numeric_limits<mjtNum>::infinity();
return MakeDict("obs"_.bind(Spec<mjtNum>({111}, {-inf, inf})),
"info:reward_forward"_.bind(Spec<mjtNum>({-1})),
"info:reward_ctrl"_.bind(Spec<mjtNum>({-1})),
"info:reward_contact"_.bind(Spec<mjtNum>({-1})),
"info:reward_survive"_.bind(Spec<mjtNum>({-1})),
"info:x_position"_.bind(Spec<mjtNum>({-1})),
"info:y_position"_.bind(Spec<mjtNum>({-1})),
"info:distance_from_origin"_.bind(Spec<mjtNum>({-1})),
"info:x_velocity"_.bind(Spec<mjtNum>({-1})),
"info:y_velocity"_.bind(Spec<mjtNum>({-1})),
// TODO(jiayi): remove these two lines for speed
"info:qpos0"_.bind(Spec<mjtNum>({15})),
"info:qvel0"_.bind(Spec<mjtNum>({14})));
bool no_pos = conf["exclude_current_positions_from_observation"_];
return MakeDict(
"obs"_.bind(Spec<mjtNum>({no_pos ? 111 : 113}, {-inf, inf})),
#ifdef ENVPOOL_TEST
"info:qpos0"_.bind(Spec<mjtNum>({15})),
"info:qvel0"_.bind(Spec<mjtNum>({14})),
#endif
"info:reward_forward"_.bind(Spec<mjtNum>({-1})),
"info:reward_ctrl"_.bind(Spec<mjtNum>({-1})),
"info:reward_contact"_.bind(Spec<mjtNum>({-1})),
"info:reward_survive"_.bind(Spec<mjtNum>({-1})),
"info:x_position"_.bind(Spec<mjtNum>({-1})),
"info:y_position"_.bind(Spec<mjtNum>({-1})),
"info:distance_from_origin"_.bind(Spec<mjtNum>({-1})),
"info:x_velocity"_.bind(Spec<mjtNum>({-1})),
"info:y_velocity"_.bind(Spec<mjtNum>({-1})));
}
template <typename Config>
static decltype(auto) ActionSpec(const Config& conf) {
Expand All @@ -67,23 +72,22 @@ typedef class EnvSpec<AntEnvFns> AntEnvSpec;

class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
protected:
int max_episode_steps_, elapsed_step_;
bool terminate_when_unhealthy_, no_pos_;
mjtNum ctrl_cost_weight_, contact_cost_weight_;
mjtNum forward_reward_weight_, healthy_reward_;
mjtNum healthy_z_min_, healthy_z_max_;
mjtNum contact_force_min_, contact_force_max_;
std::unique_ptr<mjtNum> qpos0_, qvel0_; // for align check
std::uniform_real_distribution<> dist_qpos_;
std::normal_distribution<> dist_qvel_;
bool done_;

public:
AntEnv(const Spec& spec, int env_id)
: Env<AntEnvSpec>(spec, env_id),
MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/ant.xml",
spec.config["frame_skip"_], spec.config["post_constraint"_]),
max_episode_steps_(spec.config["max_episode_steps"_]),
elapsed_step_(max_episode_steps_ + 1),
spec.config["frame_skip"_], spec.config["post_constraint"_],
spec.config["max_episode_steps"_]),
terminate_when_unhealthy_(spec.config["terminate_when_unhealthy"_]),
no_pos_(spec.config["exclude_current_positions_from_observation"_]),
ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]),
contact_cost_weight_(spec.config["contact_cost_weight"_]),
forward_reward_weight_(spec.config["forward_reward_weight"_]),
Expand All @@ -92,19 +96,16 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
healthy_z_max_(spec.config["healthy_z_max"_]),
contact_force_min_(spec.config["contact_force_min"_]),
contact_force_max_(spec.config["contact_force_max"_]),
qpos0_(new mjtNum[model_->nq]),
qvel0_(new mjtNum[model_->nv]),
dist_qpos_(-spec.config["reset_noise_scale"_],
spec.config["reset_noise_scale"_]),
dist_qvel_(0, spec.config["reset_noise_scale"_]),
done_(true) {}
dist_qvel_(0, spec.config["reset_noise_scale"_]) {}

void MujocoResetModel() {
for (int i = 0; i < model_->nq; ++i) {
data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_qpos_(gen_);
data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_qpos_(gen_);
}
for (int i = 0; i < model_->nv; ++i) {
data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_qvel_(gen_);
data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_qvel_(gen_);
}
}

Expand All @@ -114,7 +115,7 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
done_ = false;
elapsed_step_ = 0;
MujocoReset();
WriteObs(0.0f, 0, 0, 0, 0, 0, 0);
WriteObs(0.0f, 0, 0, 0, 0, 0, 0, 0);
}

void Step(const Action& action) override {
Expand All @@ -141,13 +142,16 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
x = std::max(contact_force_min_, x);
contact_cost += contact_cost_weight_ * x * x;
}

// reward and done
float reward = xv * forward_reward_weight_ + healthy_reward_ - ctrl_cost -
contact_cost;
mjtNum healthy_reward =
terminate_when_unhealthy_ || IsHealthy() ? healthy_reward_ : 0.0;
float reward =
xv * forward_reward_weight_ + healthy_reward - ctrl_cost - contact_cost;
++elapsed_step_;
done_ = !IsHealthy() || (elapsed_step_ >= max_episode_steps_);
WriteObs(reward, xv, yv, ctrl_cost, contact_cost, x_after, y_after);
done_ = (terminate_when_unhealthy_ ? !IsHealthy() : false) ||
(elapsed_step_ >= max_episode_steps_);
WriteObs(reward, xv, yv, ctrl_cost, contact_cost, x_after, y_after,
healthy_reward);
}

private:
Expand All @@ -169,12 +173,13 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
}

void WriteObs(float reward, mjtNum xv, mjtNum yv, mjtNum ctrl_cost,
mjtNum contact_cost, mjtNum x_after, mjtNum y_after) {
mjtNum contact_cost, mjtNum x_after, mjtNum y_after,
mjtNum healthy_reward) {
State state = Allocate();
state["reward"_] = reward;
// obs
mjtNum* obs = static_cast<mjtNum*>(state["obs"_].data());
for (int i = 2; i < model_->nq; ++i) {
for (int i = no_pos_ ? 2 : 0; i < model_->nq; ++i) {
*(obs++) = data_->qpos[i];
}
for (int i = 0; i < model_->nv; ++i) {
Expand All @@ -190,15 +195,17 @@ class AntEnv : public Env<AntEnvSpec>, public MujocoEnv {
state["info:reward_forward"_] = xv * forward_reward_weight_;
state["info:reward_ctrl"_] = -ctrl_cost;
state["info:reward_contact"_] = -contact_cost;
state["info:reward_survive"_] = healthy_reward_;
state["info:reward_survive"_] = healthy_reward;
state["info:x_position"_] = x_after;
state["info:y_position"_] = y_after;
state["info:distance_from_origin"_] =
std::sqrt(x_after * x_after + y_after * y_after);
state["info:x_velocity"_] = xv;
state["info:y_velocity"_] = yv;
state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
#ifdef ENVPOOL_TEST
state["info:qpos0"_].Assign(qpos0_, model_->nq);
state["info:qvel0"_].Assign(qvel0_, model_->nv);
#endif
}
};

Expand Down
40 changes: 20 additions & 20 deletions envpool/mujoco/half_cheetah.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,23 @@ class HalfCheetahEnvFns {
return MakeDict(
"max_episode_steps"_.bind(1000), "reward_threshold"_.bind(4800.0),
"frame_skip"_.bind(5), "post_constraint"_.bind(true),
"exclude_current_positions_from_observation"_.bind(true),
"ctrl_cost_weight"_.bind(0.1), "forward_reward_weight"_.bind(1.0),
"reset_noise_scale"_.bind(0.1));
}
template <typename Config>
static decltype(auto) StateSpec(const Config& conf) {
mjtNum inf = std::numeric_limits<mjtNum>::infinity();
return MakeDict("obs"_.bind(Spec<mjtNum>({17}, {-inf, inf})),
bool no_pos = conf["exclude_current_positions_from_observation"_];
return MakeDict("obs"_.bind(Spec<mjtNum>({no_pos ? 17 : 18}, {-inf, inf})),
#ifdef ENVPOOL_TEST
"info:qpos0"_.bind(Spec<mjtNum>({9})),
"info:qvel0"_.bind(Spec<mjtNum>({9})),
#endif
"info:reward_run"_.bind(Spec<mjtNum>({-1})),
"info:reward_ctrl"_.bind(Spec<mjtNum>({-1})),
"info:x_position"_.bind(Spec<mjtNum>({-1})),
"info:x_velocity"_.bind(Spec<mjtNum>({-1})),
// TODO(jiayi): remove these two lines for speed
"info:qpos0"_.bind(Spec<mjtNum>({9})),
"info:qvel0"_.bind(Spec<mjtNum>({9})));
"info:x_velocity"_.bind(Spec<mjtNum>({-1})));
}
template <typename Config>
static decltype(auto) ActionSpec(const Config& conf) {
Expand All @@ -59,35 +62,30 @@ typedef class EnvSpec<HalfCheetahEnvFns> HalfCheetahEnvSpec;

class HalfCheetahEnv : public Env<HalfCheetahEnvSpec>, public MujocoEnv {
protected:
int max_episode_steps_, elapsed_step_;
bool no_pos_;
mjtNum ctrl_cost_weight_, forward_reward_weight_;
std::unique_ptr<mjtNum> qpos0_, qvel0_; // for align check
std::uniform_real_distribution<> dist_qpos_;
std::normal_distribution<> dist_qvel_;
bool done_;

public:
HalfCheetahEnv(const Spec& spec, int env_id)
: Env<HalfCheetahEnvSpec>(spec, env_id),
MujocoEnv(spec.config["base_path"_] + "/mujoco/assets/half_cheetah.xml",
spec.config["frame_skip"_], spec.config["post_constraint"_]),
max_episode_steps_(spec.config["max_episode_steps"_]),
elapsed_step_(max_episode_steps_ + 1),
spec.config["frame_skip"_], spec.config["post_constraint"_],
spec.config["max_episode_steps"_]),
no_pos_(spec.config["exclude_current_positions_from_observation"_]),
ctrl_cost_weight_(spec.config["ctrl_cost_weight"_]),
forward_reward_weight_(spec.config["forward_reward_weight"_]),
qpos0_(new mjtNum[model_->nq]),
qvel0_(new mjtNum[model_->nv]),
dist_qpos_(-spec.config["reset_noise_scale"_],
spec.config["reset_noise_scale"_]),
dist_qvel_(0, spec.config["reset_noise_scale"_]),
done_(true) {}
dist_qvel_(0, spec.config["reset_noise_scale"_]) {}

void MujocoResetModel() {
for (int i = 0; i < model_->nq; ++i) {
data_->qpos[i] = qpos0_.get()[i] = init_qpos_[i] + dist_qpos_(gen_);
data_->qpos[i] = qpos0_[i] = init_qpos_[i] + dist_qpos_(gen_);
}
for (int i = 0; i < model_->nv; ++i) {
data_->qvel[i] = qvel0_.get()[i] = init_qvel_[i] + dist_qvel_(gen_);
data_->qvel[i] = qvel0_[i] = init_qvel_[i] + dist_qvel_(gen_);
}
}

Expand Down Expand Up @@ -127,7 +125,7 @@ class HalfCheetahEnv : public Env<HalfCheetahEnvSpec>, public MujocoEnv {
state["reward"_] = reward;
// obs
mjtNum* obs = static_cast<mjtNum*>(state["obs"_].data());
for (int i = 1; i < model_->nq; ++i) {
for (int i = no_pos_ ? 1 : 0; i < model_->nq; ++i) {
*(obs++) = data_->qpos[i];
}
for (int i = 0; i < model_->nv; ++i) {
Expand All @@ -138,8 +136,10 @@ class HalfCheetahEnv : public Env<HalfCheetahEnvSpec>, public MujocoEnv {
state["info:reward_ctrl"_] = -ctrl_cost;
state["info:x_position"_] = x_after;
state["info:x_velocity"_] = xv;
state["info:qpos0"_].Assign(qpos0_.get(), model_->nq);
state["info:qvel0"_].Assign(qvel0_.get(), model_->nv);
#ifdef ENVPOOL_TEST
state["info:qpos0"_].Assign(qpos0_, model_->nq);
state["info:qvel0"_].Assign(qvel0_, model_->nv);
#endif
}
};

Expand Down
Loading

0 comments on commit 8a269d8

Please sign in to comment.