diff --git a/README.md b/README.md
index 32680a672e..8cb0172c38 100644
--- a/README.md
+++ b/README.md
@@ -116,7 +116,7 @@ python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --
 *NOTE:* Mujoco environments require normalization to work properly, so we wrap them with VecNormalize wrapper. Currently, to ensure the models are saved with normalization (so that trained models can be restored and run without further training) the normalization coefficients are saved as tensorflow variables. This can decrease the performance somewhat, so if you require high-throughput steps with Mujoco and do not need saving/restoring the models, it may make sense to use numpy normalization instead. To do that, set 'use_tf=False` in [baselines/run.py](baselines/run.py#L116).
-### Logging and vizualizing learning curves and other training metrics
+### Logging and visualizing learning curves and other training metrics
 By default, all summary data, including progress, standard output, is saved to a unique directory in a temp folder, specified by a call to Python's [tempfile.gettempdir()](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir). The directory can be changed with the `--log_path` command-line option.
 ```bash
diff --git a/baselines/a2c/a2c.py b/baselines/a2c/a2c.py
index 12d5769e72..ec972ad9cf 100644
--- a/baselines/a2c/a2c.py
+++ b/baselines/a2c/a2c.py
@@ -150,7 +150,7 @@ def learn(
 env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)
- seed: seed to make random number sequence in the alorightm reproducible. By default is None which means seed from system noise generator (not reproducible)
+ seed: seed to make random number sequence in the algorithm reproducible. By default is None which means seed from system noise generator (not reproducible)
 nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel)
@@ -159,7 +159,7 @@ def learn(
 vf_coef: float, coefficient in front of value function loss in the total loss function (default: 0.5)
- ent_coef: float, coeffictiant in front of the policy entropy in the total loss function (default: 0.01)
+ ent_coef: float, coefficient in front of the policy entropy in the total loss function (default: 0.01)
 max_gradient_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)
diff --git a/baselines/acer/acer.py b/baselines/acer/acer.py
index 62764dbc61..222497b371 100644
--- a/baselines/acer/acer.py
+++ b/baselines/acer/acer.py
@@ -96,10 +96,10 @@ def custom_getter(getter, *args, **kwargs):
 with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True):
 polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess)
- # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i
+ # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable index by action at step i
 # action probability distributions according to train_model, polyak_model and step_model
- # poilcy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax
+ # policy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax
 train_model_p = tf.nn.softmax(train_model.pi)
 polyak_model_p = tf.nn.softmax(polyak_model.pi)
 step_model_p = tf.nn.softmax(step_model.pi)
@@ -123,7 +123,7 @@ def custom_getter(getter, *args, **kwargs):
 # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps))
 entropy = tf.reduce_mean(cat_entropy_softmax(f))
- # Policy Graident loss, with truncated importance sampling & bias correction
+ # Policy Gradient loss, with truncated importance sampling & bias correction
 v = strip(v, nenvs, nsteps, True)
 check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
 check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)
diff --git a/baselines/acer/runner.py b/baselines/acer/runner.py
index afd19ce2eb..c5625ab6f1 100644
--- a/baselines/acer/runner.py
+++ b/baselines/acer/runner.py
@@ -34,7 +34,7 @@ def run(self):
 mb_mus.append(mus)
 mb_dones.append(self.dones)
 obs, rewards, dones, _ = self.env.step(actions)
- # states information for statefull models like LSTM
+ # states information for stateful models like LSTM
 self.states = states
 self.dones = dones
 self.obs = obs
@@ -51,7 +51,7 @@ def run(self):
 mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
- mb_masks = mb_dones # Used for statefull models like LSTM's to mask state when done
+ mb_masks = mb_dones # Used for stateful models like LSTM's to mask state when done
 mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards
 # shapes are now [nenv, nsteps, []]
diff --git a/baselines/acktr/kfac.py b/baselines/acktr/kfac.py
index 3d4a8c27ad..9430c03c58 100644
--- a/baselines/acktr/kfac.py
+++ b/baselines/acktr/kfac.py
@@ -64,7 +64,7 @@ def getFactors(self, g, varlist):
 fops = []
 def searchFactors(gradient, graph):
- # hard coded search stratergy
+ # hard coded search strategy
 bpropOp = gradient.op
 bpropOp_name = bpropOp.name
@@ -72,7 +72,7 @@ def searchFactors(gradient, graph):
 fTensors = []
 # combining additive gradient, assume they are the same op type and
- # indepedent
+ # independent
 if 'AddN' in bpropOp_name:
 factors = []
 for g in gradient.op.inputs:
@@ -134,7 +134,7 @@ def searchFactors(gradient, graph):
 ########
 # check associated weights and bias for homogeneous coordinate representation
- # and check redundent factors
+ # and check redundant factors
 # TO-DO: there may be a bug to detect associate bias and weights for
 # forking layer, e.g. in inception models.
 for param in varlist:
@@ -785,7 +785,7 @@ def getKfacPrecondUpdates(self, gradlist, varlist):
 local_vg = tf.reduce_sum(grad * g * (self._lr * self._lr))
 vg += local_vg
- # recale everything
+ # rescale everything
 if KFAC_DEBUG:
 print('apply vFv clipping')
diff --git a/baselines/common/misc_util.py b/baselines/common/misc_util.py
index 48bc3dedc5..68d5d76f8c 100644
--- a/baselines/common/misc_util.py
+++ b/baselines/common/misc_util.py
@@ -70,8 +70,8 @@ def pretty_eta(seconds_left):
 2 hours and 37 minutes
 less than a minute
- Paramters
- ---------
+ Parameters
+ ----------
 seconds_left: int
 Number of seconds to be converted to the ETA
 Returns
@@ -126,7 +126,7 @@ def update(self, new_val):
 Parameters
 ----------
 new_val: float
- new observated value of estimated quantity.
+ new observed value of estimated quantity.
 """
 if self._value is None:
 self._value = new_val
@@ -192,7 +192,7 @@ def relatively_safe_pickle_dump(obj, path, compression=False):
 - it is sometimes possible that we end up with useless temp file which needs to be deleted manually (it will be removed automatically on the next function call)
- The indended use case is periodic checkpoints of experiment state, such that we never
+ The intended use case is periodic checkpoints of experiment state, such that we never
 corrupt previous checkpoints if the current one fails.
 Parameters
diff --git a/baselines/common/mpi_util.py b/baselines/common/mpi_util.py
index ca7044e488..2f7f903a6a 100644
--- a/baselines/common/mpi_util.py
+++ b/baselines/common/mpi_util.py
@@ -39,7 +39,7 @@ def setup_mpi_gpus():
 Set CUDA_VISIBLE_DEVICES to MPI rank if not already set
 """
 if 'CUDA_VISIBLE_DEVICES' not in os.environ:
- if sys.platform == 'darwin': # This Assumes if you're on OSX you're just
+ if sys.platform == 'darwin': # This Assumes if you're on macOS you're just
 ids = [] # doing a smoke test and don't want GPUs
 else:
 lrank, _lsize = get_local_rank_size(MPI.COMM_WORLD)
diff --git a/baselines/common/running_mean_std.py b/baselines/common/running_mean_std.py
index 963a57fbc2..45ad702186 100644
--- a/baselines/common/running_mean_std.py
+++ b/baselines/common/running_mean_std.py
@@ -36,7 +36,7 @@ def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var,
 class TfRunningMeanStd(object):
 # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
 '''
- TensorFlow variables-based implmentation of computing running mean and std
+ TensorFlow variables-based implementation of computing running mean and std
 Benefit of this implementation is that it can be saved / loaded together with the tensorflow model
 '''
 def __init__(self, epsilon=1e-4, shape=(), scope=''):
diff --git a/baselines/common/schedules.py b/baselines/common/schedules.py
index 9dfff50f95..4e4205a7f8 100644
--- a/baselines/common/schedules.py
+++ b/baselines/common/schedules.py
@@ -40,7 +40,7 @@ def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=
 """Piecewise schedule.
 endpoints: [(int, int)]
- list of pairs `(time, value)` meanining that schedule should output
+ list of pairs `(time, value)` meaning that schedule should output
 `value` when `t==time`. All the values for time must be sorted in an increasing order. When t is between two times, e.g. `(time_a, value_a)` and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs
@@ -51,7 +51,7 @@ def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=
 to the `endpoints`. Alpha is the fraction of distance from left endpoint to right endpoint that t has covered. See linear_interpolation for example.
 outside_value: float
- if the value is requested outside of all the intervals sepecified in
+ if the value is requested outside of all the intervals specified in
 `endpoints` this value is returned. If None then AssertionError is raised when outside value is requested.
 """
diff --git a/baselines/common/segment_tree.py b/baselines/common/segment_tree.py
index cb386ecdb5..da921f441b 100644
--- a/baselines/common/segment_tree.py
+++ b/baselines/common/segment_tree.py
@@ -16,8 +16,8 @@ def __init__(self, capacity, operation, neutral_element):
 `reduce` operation which reduces `operation` over a contiguous subsequence of items in the array.
- Paramters
- ---------
+ Parameters
+ ----------
 capacity: int
 Total size of the array - must be a power of two.
 operation: lambda obj, obj -> obj
diff --git a/baselines/common/tests/envs/mnist_env.py b/baselines/common/tests/envs/mnist_env.py
index cc0bde0619..94a73d70da 100644
--- a/baselines/common/tests/envs/mnist_env.py
+++ b/baselines/common/tests/envs/mnist_env.py
@@ -15,7 +15,7 @@ def __init__(
 import filelock
 from tensorflow.examples.tutorials.mnist import input_data
 # we could use temporary directory for this with a context manager and
- # TemporaryDirecotry, but then each test that uses mnist would re-download the data
+ # TemporaryDirectory, but then each test that uses mnist would re-download the data
 # this way the data is not cleaned up, but we only download it once per machine
 mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data')
 with filelock.FileLock(mnist_path + '.lock'):
diff --git a/baselines/common/vec_env/subproc_vec_env.py b/baselines/common/vec_env/subproc_vec_env.py
index 8c74000889..47b8a372bd 100644
--- a/baselines/common/vec_env/subproc_vec_env.py
+++ b/baselines/common/vec_env/subproc_vec_env.py
@@ -38,7 +38,7 @@ def step_env(env, action):
 class SubprocVecEnv(VecEnv):
 """
- VecEnv that runs multiple environments in parallel in subproceses and communicates with them via pipes.
+ VecEnv that runs multiple environments in parallel in subprocesses and communicates with them via pipes.
 Recommended to use when num_envs > 1 and step() can be a bottleneck.
 """
 def __init__(self, env_fns, spaces=None, context='spawn', in_series=1):
diff --git a/baselines/deepq/build_graph.py b/baselines/deepq/build_graph.py
index f7de316b29..2edf52884c 100644
--- a/baselines/deepq/build_graph.py
+++ b/baselines/deepq/build_graph.py
@@ -72,7 +72,7 @@
 obs_tp1 gets ignored, but must be of the valid shape.
 dtype must be float32 and shape must be (batch_size,)
 weight: np.array
- imporance weights for every element of the batch (gradient is multiplied
+ importance weights for every element of the batch (gradient is multiplied
 by the importance weight)
 dtype must be float32 and shape must be (batch_size,)
 Returns
@@ -88,7 +88,7 @@
 Q(s,a) - (r + gamma * max_a' Q'(s', a'))
- Where Q' is lagging behind Q to stablize the learning. For example for Atari
+ Where Q' is lagging behind Q to stabilize the learning. For example for Atari
 Q' is set to Q once every 10000 updates training steps.
@@ -388,7 +388,7 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=
 q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act
 q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")
- # target q network evalution
+ # target q network evaluation
 q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
 target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func")
diff --git a/baselines/deepq/replay_buffer.py b/baselines/deepq/replay_buffer.py
index 3ddf708601..73ff992144 100644
--- a/baselines/deepq/replay_buffer.py
+++ b/baselines/deepq/replay_buffer.py
@@ -148,7 +148,7 @@ def sample(self, batch_size, beta):
 denoting importance weight of each sampled transition
 idxes: np.array
 Array of shape (batch_size,) and dtype np.int32
- idexes in buffer of sampled experiences
+ indexes in buffer of sampled experiences
 """
 assert beta > 0
diff --git a/baselines/gail/adversary.py b/baselines/gail/adversary.py
index 96b8a4cb49..98be13322f 100644
--- a/baselines/gail/adversary.py
+++ b/baselines/gail/adversary.py
@@ -26,7 +26,7 @@ def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="advers
 self.num_actions = env.action_space.shape[0]
 self.hidden_size = hidden_size
 self.build_ph()
- # Build grpah
+ # Build graph
 generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False)
 expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True)
 # Build accuracy
diff --git a/baselines/gail/behavior_clone.py b/baselines/gail/behavior_clone.py
index aab1bc6407..cb458efa32 100644
--- a/baselines/gail/behavior_clone.py
+++ b/baselines/gail/behavior_clone.py
@@ -32,7 +32,7 @@ def argsparser():
 parser.add_argument('--traj_limitation', type=int, default=-1)
 # Network Configuration (Using MLP Policy)
 parser.add_argument('--policy_hidden_size', type=int, default=100)
- # for evaluatation
+ # for evaluation
 boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate')
 boolean_flag(parser, 'save_sample', default=False, help='save the trajectories or not')
 parser.add_argument('--BC_max_iter', help='Max iteration for training BC', type=int, default=1e5)
diff --git a/baselines/gail/gail-eval.py b/baselines/gail/gail-eval.py
index 1148cb309c..babee28749 100644
--- a/baselines/gail/gail-eval.py
+++ b/baselines/gail/gail-eval.py
@@ -1,5 +1,5 @@
 '''
-This code is used to evalaute the imitators trained with different number of trajectories
+This code is used to evaluate the imitators trained with different number of trajectories
 and plot the results in the same figure for easy comparison.
 '''
diff --git a/baselines/gail/result/gail-result.md b/baselines/gail/result/gail-result.md
index 8ecc9ed1eb..17c5c64646 100644
--- a/baselines/gail/result/gail-result.md
+++ b/baselines/gail/result/gail-result.md
@@ -24,7 +24,7 @@ Hopper-v1, Walker2d-v1, HalfCheetah-v1, Humanoid-v1, HumanoidStandup-v1. Every i
 For details (e.g., adversarial loss, discriminator accuracy, etc.) about GAIL training, please see [here](https://drive.google.com/drive/folders/1nnU8dqAV9i37-_5_vWIspyFUJFQLCsDD?usp=sharing)
-### Determinstic Policy (Set std=0)
+### Deterministic Policy (Set std=0)
 | | Un-normalized | Normalized |
 |---|---|---|
 | Hopper-v1 | | |
@@ -33,7 +33,7 @@ For details (e.g., adversarial loss, discriminator accuracy, etc.) about GAIL tr
 | Humanoid-v1 | | |
 | HumanoidStandup-v1 | | |
-### Stochatic Policy
+### Stochastic Policy
 | | Un-normalized | Normalized |
 |---|---|---|
 | Hopper-v1 | | |
diff --git a/baselines/gail/run_mujoco.py b/baselines/gail/run_mujoco.py
index e875c5ccd2..fc021fe829 100644
--- a/baselines/gail/run_mujoco.py
+++ b/baselines/gail/run_mujoco.py
@@ -30,7 +30,7 @@ def argsparser():
 parser.add_argument('--load_model_path', help='if provided, load the model', type=str, default=None)
 # Task
 parser.add_argument('--task', type=str, choices=['train', 'evaluate', 'sample'], default='train')
- # for evaluatation
+ # for evaluation
 boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate')
 boolean_flag(parser, 'save_sample', default=False, help='save the trajectories or not')
 # Mujoco Dataset Configuration
@@ -46,7 +46,7 @@ def argsparser():
 parser.add_argument('--max_kl', type=float, default=0.01)
 parser.add_argument('--policy_entcoeff', help='entropy coefficiency of policy', type=float, default=0)
 parser.add_argument('--adversary_entcoeff', help='entropy coefficiency of discriminator', type=float, default=1e-3)
- # Traing Configuration
+ # Training Configuration
 parser.add_argument('--save_per_iter', help='save model every xx iterations', type=int, default=100)
 parser.add_argument('--num_timesteps', help='number of timesteps per episode', type=int, default=5e6)
 # Behavior Cloning
diff --git a/baselines/gail/trpo_mpi.py b/baselines/gail/trpo_mpi.py
index 615a4326a7..d97df25588 100644
--- a/baselines/gail/trpo_mpi.py
+++ b/baselines/gail/trpo_mpi.py
@@ -248,7 +248,7 @@ def fisher_vector_product(p):
 add_vtarg_and_adv(seg, gamma, lam)
 # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
 ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
- vpredbefore = seg["vpred"] # predicted value function before udpate
+ vpredbefore = seg["vpred"] # predicted value function before update
 atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
 if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy
diff --git a/baselines/her/README.md b/baselines/her/README.md
index 232b5053bd..a7f39d6aa7 100644
--- a/baselines/her/README.md
+++ b/baselines/her/README.md
@@ -48,7 +48,7 @@ python experiment/data_generation/fetch_data_generation.py
 ```
 This outputs ```data_fetch_random_100.npz``` file which is our data file.
-To launch training with demonstrations (more technically, with behaviour cloning loss as an auxilliary loss), run the following
+To launch training with demonstrations (more technically, with behaviour cloning loss as an auxiliary loss), run the following
 ```bash
 python -m baselines.run --alg=her --env=FetchPickAndPlace-v1 --num_timesteps=2.5e6 --demo_file=/Path/to/demo_file.npz
 ```
@@ -56,14 +56,14 @@ This will train a DDPG+HER agent on the `FetchPickAndPlace` environment by using
 To inspect what the agent has learned, use the `--play` flag as described above.
 #### Configuration
-The provided configuration is for training an agent with HER without demonstrations, we need to change a few paramters for the HER algorithm to learn through demonstrations, to do that, set:
+The provided configuration is for training an agent with HER without demonstrations, we need to change a few parameters for the HER algorithm to learn through demonstrations, to do that, set:
-* bc_loss: 1 - whether or not to use the behavior cloning loss as an auxilliary loss
+* bc_loss: 1 - whether or not to use the behavior cloning loss as an auxiliary loss
 * q_filter: 1 - whether or not a Q value filter should be used on the Actor outputs
 * num_demo: 100 - number of expert demo episodes
 * demo_batch_size: 128 - number of samples to be used from the demonstrations buffer, per mpi thread
 * prm_loss_weight: 0.001 - Weight corresponding to the primary loss
-* aux_loss_weight: 0.0078 - Weight corresponding to the auxilliary loss also called the cloning loss
+* aux_loss_weight: 0.0078 - Weight corresponding to the auxiliary loss also called the cloning loss
 Apart from these changes the reported results also have the following configurational changes:
@@ -80,6 +80,6 @@ Training with demonstrations helps overcome the exploration problem and achieves
-Training results for Fetch Pick and Place task constrasting between training with and without demonstration data.
+Training results for Fetch Pick and Place task contrasting between training with and without demonstration data.
diff --git a/baselines/her/ddpg.py b/baselines/her/ddpg.py
index 988f14b015..9cff557708 100644
--- a/baselines/her/ddpg.py
+++ b/baselines/her/ddpg.py
@@ -55,12 +55,12 @@ def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polya
 sample_transitions (function) function that samples from the replay buffer
 gamma (float): gamma used for Q learning updates
 reuse (boolean): whether or not the networks should be reused
- bc_loss: whether or not the behavior cloning loss should be used as an auxilliary loss
- q_filter: whether or not a filter on the q value update should be used when training with demonstartions
- num_demo: Number of episodes in to be used in the demonstration buffer
+ bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss
+ q_filter: whether or not a filter on the q value update should be used when training with demonstrations
+ num_demo: Number of episodes to be used in the demonstration buffer
 demo_batch_size: number of samples to be used from the demonstrations buffer, per mpi thread
 prm_loss_weight: Weight corresponding to the primary loss
- aux_loss_weight: Weight corresponding to the auxilliary loss also called the cloning loss
+ aux_loss_weight: Weight corresponding to the auxiliary loss also called the cloning loss
 """
 if self.clip_return is None:
 self.clip_return = np.inf
@@ -364,7 +364,7 @@ def _create_network(self, reuse=False):
 self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask), maskMain, axis=0) - tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask), maskMain, axis=0)))
 self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) #primary loss scaled by it's respective weight prm_loss_weight
 self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) #L2 loss on action values scaled by the same weight prm_loss_weight
- self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf #adding the cloning loss to the actor loss as an auxilliary loss scaled by its weight aux_loss_weight
+ self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf #adding the cloning loss to the actor loss as an auxiliary loss scaled by its weight aux_loss_weight
 elif self.bc_loss == 1 and self.q_filter == 0: # train with demonstrations without q_filter
 self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask((self.main.pi_tf), mask) - tf.boolean_mask((batch_tf['u']), mask)))
diff --git a/baselines/her/experiment/config.py b/baselines/her/experiment/config.py
index 6370505a84..839492ae39 100644
--- a/baselines/her/experiment/config.py
+++ b/baselines/her/experiment/config.py
@@ -46,12 +46,12 @@
 'norm_eps': 0.01, # epsilon used for observation normalization
 'norm_clip': 5, # normalized observations are cropped to this values
- 'bc_loss': 0, # whether or not to use the behavior cloning loss as an auxilliary loss
+ 'bc_loss': 0, # whether or not to use the behavior cloning loss as an auxiliary loss
 'q_filter': 0, # whether or not a Q value filter should be used on the Actor outputs
 'num_demo': 100, # number of expert demo episodes
 'demo_batch_size': 128, #number of samples to be used from the demonstrations buffer, per mpi thread 128/1024 or 32/256
 'prm_loss_weight': 0.001, #Weight corresponding to the primary loss
- 'aux_loss_weight': 0.0078, #Weight corresponding to the auxilliary loss also called the cloning loss
+ 'aux_loss_weight': 0.0078, #Weight corresponding to the auxiliary loss also called the cloning loss
 }
@@ -84,7 +84,7 @@ def make_env(subrank=None):
 except ImportError:
 MPI = None
 mpi_rank = 0
- logger.warn('Running with a single MPI process. This should work, but the results may differ from the ones publshed in Plappert et al.')
+ logger.warn('Running with a single MPI process. This should work, but the results may differ from the ones published in Plappert et al.')
 max_episode_steps = env._max_episode_steps
 env = Monitor(env,
diff --git a/baselines/ppo1/pposgd_simple.py b/baselines/ppo1/pposgd_simple.py
index 7ecd48db58..cd52763e67 100644
--- a/baselines/ppo1/pposgd_simple.py
+++ b/baselines/ppo1/pposgd_simple.py
@@ -165,7 +165,7 @@ def learn(env, policy_fn, *,
 # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
 ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
- vpredbefore = seg["vpred"] # predicted value function before udpate
+ vpredbefore = seg["vpred"] # predicted value function before update
 atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
 d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), deterministic=pi.recurrent)
 optim_batchsize = optim_batchsize or ob.shape[0]
diff --git a/baselines/results_plotter.py b/baselines/results_plotter.py
index 66f09bd12d..273759f575 100644
--- a/baselines/results_plotter.py
+++ b/baselines/results_plotter.py
@@ -82,8 +82,8 @@ def main():
 parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log'])
 parser.add_argument('--num_timesteps', type=int, default=int(10e6))
- parser.add_argument('--xaxis', help = 'Varible on X-axis', default = X_TIMESTEPS)
- parser.add_argument('--yaxis', help = 'Varible on Y-axis', default = Y_REWARD)
+ parser.add_argument('--xaxis', help = 'Variable on X-axis', default = X_TIMESTEPS)
+ parser.add_argument('--yaxis', help = 'Variable on Y-axis', default = Y_REWARD)
 parser.add_argument('--task_name', help = 'Title of plot', default = 'Breakout')
 args = parser.parse_args()
 args.dirs = [os.path.abspath(dir) for dir in args.dirs]
diff --git a/baselines/trpo_mpi/trpo_mpi.py b/baselines/trpo_mpi/trpo_mpi.py
index cd1e7eab77..cb2f602008 100644
--- a/baselines/trpo_mpi/trpo_mpi.py
+++ b/baselines/trpo_mpi/trpo_mpi.py
@@ -127,7 +127,7 @@ def learn(*,
 cg_damping conjugate gradient damping
- vf_stepsize learning rate for adam optimizer used to optimie value function loss
+ vf_stepsize learning rate for adam optimizer used to optimize value function loss
 vf_iters number of iterations of value function optimization iterations per each policy optimization step
@@ -277,7 +277,7 @@ def allmean(x):
 rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards
 if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0:
- # noththing to be done
+ # nothing to be done
 return pi
 assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \
@@ -299,7 +299,7 @@ def allmean(x):
 # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
 ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
- vpredbefore = seg["vpred"] # predicted value function before udpate
+ vpredbefore = seg["vpred"] # predicted value function before update
 atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
 if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)