Spelling #1214

Open

jsoref wants to merge 38 commits into master
Commits (38), all authored by jsoref on Jan 24, 2023:

135c679  spelling: algorithm
6b1149a  spelling: auxiliary
7135c55  spelling: coefficient
9b18680  spelling: contrasting
7458d59  spelling: demonstrations
f57a63c  spelling: deterministic
8033327  spelling: directory
6818376  spelling: evaluate
22b8c4f  spelling: evaluation
a7a7511  spelling: gradient
5498b5e  spelling: graph
e9bf097  spelling: implementation
a8d2ff1  spelling: importance
6895b22  spelling: independent
1a933f0  spelling: indexes
b8be389  spelling: intended
782bbdb  spelling: macos
01f44c9  spelling: meaning
9de11b2  spelling: nothing
80c5812  spelling: observed
7f0bd23  spelling: optimize
cd7aea9  spelling: parameters
cb1b27b  spelling: policy
b39692b  spelling: published
4cc118a  spelling: redundant
811937a  spelling: rescale
eff93cb  spelling: sequence
cf226ed  spelling: specified
6f037c7  spelling: stabilize
810b4d1  spelling: stateful
c3e81e3  spelling: stochastic
df25a9d  spelling: strategy
36f9c0a  spelling: subprocesses
294f2c0  spelling: to
663e81f  spelling: training
dce56c8  spelling: update
78d1551  spelling: variable
d24c669  spelling: visualizing
2 changes: 1 addition & 1 deletion README.md
@@ -116,7 +116,7 @@ python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --

*NOTE:* Mujoco environments require normalization to work properly, so we wrap them with VecNormalize wrapper. Currently, to ensure the models are saved with normalization (so that trained models can be restored and run without further training) the normalization coefficients are saved as tensorflow variables. This can decrease the performance somewhat, so if you require high-throughput steps with Mujoco and do not need saving/restoring the models, it may make sense to use numpy normalization instead. To do that, set 'use_tf=False` in [baselines/run.py](baselines/run.py#L116).
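For reference, a minimal sketch of the wrapping this note describes; it is not part of the diff, and the environment id and import paths are assumptions based on the repository layout ('HalfCheetah-v2' requires mujoco-py):

```python
# Illustrative only: wrap a Mujoco env so observations and returns are normalized.
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_normalize import VecNormalize

venv = DummyVecEnv([lambda: gym.make('HalfCheetah-v2')])  # vectorize a single environment
venv = VecNormalize(venv)  # maintain running statistics for observations and returns
```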

### Logging and vizualizing learning curves and other training metrics
### Logging and visualizing learning curves and other training metrics
By default, all summary data, including progress, standard output, is saved to a unique directory in a temp folder, specified by a call to Python's [tempfile.gettempdir()](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir).
The directory can be changed with the `--log_path` command-line option.
```bash
...
```
4 changes: 2 additions & 2 deletions baselines/a2c/a2c.py
@@ -150,7 +150,7 @@ def learn(
env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)


seed: seed to make random number sequence in the alorightm reproducible. By default is None which means seed from system noise generator (not reproducible)
seed: seed to make random number sequence in the algorithm reproducible. By default is None which means seed from system noise generator (not reproducible)

nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
nenv is number of environment copies simulated in parallel)
@@ -159,7 +159,7 @@ def learn(

vf_coef: float, coefficient in front of value function loss in the total loss function (default: 0.5)

ent_coef: float, coeffictiant in front of the policy entropy in the total loss function (default: 0.01)
ent_coef: float, coefficient in front of the policy entropy in the total loss function (default: 0.01)

max_gradient_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)
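A rough usage sketch of the parameters documented above; argument names are taken from this docstring and may not match the exact signature in baselines/a2c/a2c.py, so treat it as an assumption:

```python
# Hedged example: train A2C on a small environment with the documented knobs.
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.a2c.a2c import learn

env = DummyVecEnv([lambda: gym.make('CartPole-v1')])  # learn() expects a vectorized env
model = learn(network='mlp', env=env, seed=0, nsteps=5,
              total_timesteps=100_000, vf_coef=0.5, ent_coef=0.01)
```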

6 changes: 3 additions & 3 deletions baselines/acer/acer.py
@@ -96,10 +96,10 @@ def custom_getter(getter, *args, **kwargs):
with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True):
polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess)

# Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i
# Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable index by action at step i

# action probability distributions according to train_model, polyak_model and step_model
# poilcy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax
# policy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax
train_model_p = tf.nn.softmax(train_model.pi)
polyak_model_p = tf.nn.softmax(polyak_model.pi)
step_model_p = tf.nn.softmax(step_model.pi)
@@ -123,7 +123,7 @@ def custom_getter(getter, *args, **kwargs):
# entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps))
entropy = tf.reduce_mean(cat_entropy_softmax(f))

# Policy Graident loss, with truncated importance sampling & bias correction
# Policy Gradient loss, with truncated importance sampling & bias correction
v = strip(v, nenvs, nsteps, True)
check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)
4 changes: 2 additions & 2 deletions baselines/acer/runner.py
@@ -34,7 +34,7 @@ def run(self):
mb_mus.append(mus)
mb_dones.append(self.dones)
obs, rewards, dones, _ = self.env.step(actions)
# states information for statefull models like LSTM
# states information for stateful models like LSTM
self.states = states
self.dones = dones
self.obs = obs
@@ -51,7 +51,7 @@ def run(self):

mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)

mb_masks = mb_dones # Used for statefull models like LSTM's to mask state when done
mb_masks = mb_dones # Used for stateful models like LSTM's to mask state when done
mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards

# shapes are now [nenv, nsteps, []]
8 changes: 4 additions & 4 deletions baselines/acktr/kfac.py
@@ -64,15 +64,15 @@ def getFactors(self, g, varlist):
fops = []

def searchFactors(gradient, graph):
# hard coded search stratergy
# hard coded search strategy
bpropOp = gradient.op
bpropOp_name = bpropOp.name

bTensors = []
fTensors = []

# combining additive gradient, assume they are the same op type and
# indepedent
# independent
if 'AddN' in bpropOp_name:
factors = []
for g in gradient.op.inputs:
@@ -134,7 +134,7 @@ def searchFactors(gradient, graph):

########
# check associated weights and bias for homogeneous coordinate representation
# and check redundent factors
# and check redundant factors
# TO-DO: there may be a bug to detect associate bias and weights for
# forking layer, e.g. in inception models.
for param in varlist:
@@ -785,7 +785,7 @@ def getKfacPrecondUpdates(self, gradlist, varlist):
local_vg = tf.reduce_sum(grad * g * (self._lr * self._lr))
vg += local_vg

# recale everything
# rescale everything
[Author comment on this line: "?"]
if KFAC_DEBUG:
print('apply vFv clipping')

8 changes: 4 additions & 4 deletions baselines/common/misc_util.py
@@ -70,8 +70,8 @@ def pretty_eta(seconds_left):
2 hours and 37 minutes
less than a minute

Paramters
---------
Parameters
----------
seconds_left: int
Number of seconds to be converted to the ETA
Returns
@@ -126,7 +126,7 @@ def update(self, new_val):
Parameters
----------
new_val: float
new observated value of estimated quantity.
new observed value of estimated quantity.
"""
if self._value is None:
self._value = new_val
@@ -192,7 +192,7 @@ def relatively_safe_pickle_dump(obj, path, compression=False):
- it is sometimes possible that we end up with useless temp file which needs to be
deleted manually (it will be removed automatically on the next function call)

The indended use case is periodic checkpoints of experiment state, such that we never
The intended use case is periodic checkpoints of experiment state, such that we never
corrupt previous checkpoints if the current one fails.

Parameters
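The pattern this docstring describes (write to a temporary file, then atomically swap it in) looks roughly like the following standalone sketch; it is an illustration, not the implementation in misc_util.py:

```python
# Minimal "relatively safe" checkpoint write: the target file is only ever
# replaced by a fully written temp file, so an interrupted dump cannot corrupt it.
import os
import pickle
import tempfile

def safe_pickle_dump(obj, path):
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or '.')
    try:
        with os.fdopen(fd, 'wb') as f:
            pickle.dump(obj, f)
        os.replace(tmp_path, path)  # atomic rename; previous checkpoint stays intact on failure
    except BaseException:
        os.remove(tmp_path)  # leave no useless temp file behind when the dump fails
        raise
```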
2 changes: 1 addition & 1 deletion baselines/common/mpi_util.py
@@ -39,7 +39,7 @@ def setup_mpi_gpus():
Set CUDA_VISIBLE_DEVICES to MPI rank if not already set
"""
if 'CUDA_VISIBLE_DEVICES' not in os.environ:
if sys.platform == 'darwin': # This Assumes if you're on OSX you're just
if sys.platform == 'darwin': # This Assumes if you're on macOS you're just
ids = [] # doing a smoke test and don't want GPUs
else:
lrank, _lsize = get_local_rank_size(MPI.COMM_WORLD)
2 changes: 1 addition & 1 deletion baselines/common/running_mean_std.py
@@ -36,7 +36,7 @@ def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var,
class TfRunningMeanStd(object):
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
'''
TensorFlow variables-based implmentation of computing running mean and std
TensorFlow variables-based implementation of computing running mean and std
Benefit of this implementation is that it can be saved / loaded together with the tensorflow model
'''
def __init__(self, epsilon=1e-4, shape=(), scope=''):
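The parallel-update rule referenced by the Wikipedia link above (Chan et al.) can be written independently as a small numpy sketch; this mirrors what update_mean_var_count_from_moments computes:

```python
# Merge the running (mean, var, count) with a new batch's moments in one pass.
import numpy as np

def merge_moments(mean, var, count, batch_mean, batch_var, batch_count):
    delta = batch_mean - mean
    tot_count = count + batch_count
    new_mean = mean + delta * batch_count / tot_count
    m_a = var * count                      # sum of squared deviations so far
    m_b = batch_var * batch_count          # sum of squared deviations in the batch
    m_2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count
    return new_mean, m_2 / tot_count, tot_count
```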
4 changes: 2 additions & 2 deletions baselines/common/schedules.py
@@ -40,7 +40,7 @@ def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=
"""Piecewise schedule.

endpoints: [(int, int)]
list of pairs `(time, value)` meanining that schedule should output
list of pairs `(time, value)` meaning that schedule should output
`value` when `t==time`. All the values for time must be sorted in
an increasing order. When t is between two times, e.g. `(time_a, value_a)`
and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs
@@ -51,7 +51,7 @@ def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=
to the `endpoints`. Alpha is the fraction of distance from left endpoint to
right endpoint that t has covered. See linear_interpolation for example.
outside_value: float
if the value is requested outside of all the intervals sepecified in
if the value is requested outside of all the intervals specified in
`endpoints` this value is returned. If None then AssertionError is
raised when outside value is requested.
"""
4 changes: 2 additions & 2 deletions baselines/common/segment_tree.py
@@ -16,8 +16,8 @@ def __init__(self, capacity, operation, neutral_element):
`reduce` operation which reduces `operation` over
a contiguous subsequence of items in the array.

Paramters
---------
Parameters
----------
capacity: int
Total size of the array - must be a power of two.
operation: lambda obj, obj -> obj
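As a concrete illustration of the reduce-over-a-range behaviour described here, assuming the SumSegmentTree subclass defined in the same module:

```python
from baselines.common.segment_tree import SumSegmentTree

tree = SumSegmentTree(capacity=4)          # capacity must be a power of two
tree[0], tree[1], tree[2] = 1.0, 2.0, 3.0
print(tree.sum(0, 3))                      # reduces (here: sums) items 0..2 -> 6.0
```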
2 changes: 1 addition & 1 deletion baselines/common/tests/envs/mnist_env.py
@@ -15,7 +15,7 @@ def __init__(
import filelock
from tensorflow.examples.tutorials.mnist import input_data
# we could use temporary directory for this with a context manager and
# TemporaryDirecotry, but then each test that uses mnist would re-download the data
# TemporaryDirectory, but then each test that uses mnist would re-download the data
# this way the data is not cleaned up, but we only download it once per machine
mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data')
with filelock.FileLock(mnist_path + '.lock'):
2 changes: 1 addition & 1 deletion baselines/common/vec_env/subproc_vec_env.py
@@ -38,7 +38,7 @@ def step_env(env, action):

class SubprocVecEnv(VecEnv):
"""
VecEnv that runs multiple environments in parallel in subproceses and communicates with them via pipes.
VecEnv that runs multiple environments in parallel in subprocesses and communicates with them via pipes.
Recommended to use when num_envs > 1 and step() can be a bottleneck.
"""
def __init__(self, env_fns, spaces=None, context='spawn', in_series=1):
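A hedged usage sketch for this class; the environment id and seeding are illustrative, not taken from the diff:

```python
import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(rank):
    def _thunk():
        env = gym.make('CartPole-v1')
        env.seed(rank)
        return env
    return _thunk

if __name__ == '__main__':  # needed because workers are started as spawned subprocesses
    venv = SubprocVecEnv([make_env(i) for i in range(4)])
    obs = venv.reset()       # batched observations, one row per environment
    venv.close()
```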
6 changes: 3 additions & 3 deletions baselines/deepq/build_graph.py
@@ -72,7 +72,7 @@
obs_tp1 gets ignored, but must be of the valid shape.
dtype must be float32 and shape must be (batch_size,)
weight: np.array
imporance weights for every element of the batch (gradient is multiplied
importance weights for every element of the batch (gradient is multiplied
by the importance weight) dtype must be float32 and shape must be (batch_size,)

Returns
@@ -88,7 +88,7 @@

Q(s,a) - (r + gamma * max_a' Q'(s', a'))

Where Q' is lagging behind Q to stablize the learning. For example for Atari
Where Q' is lagging behind Q to stabilize the learning. For example for Atari

Q' is set to Q once every 10000 updates training steps.
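The weighted TD error this docstring describes can be sketched as follows; the tensor names are placeholders of my own, not the ones used in build_graph.py:

```python
# TF1-style sketch of: importance_weight * huber(Q(s,a) - (r + gamma * max_a' Q'(s',a')))
import tensorflow as tf
from baselines.common.tf_util import huber_loss

num_actions, gamma = 4, 0.99
rew_t = tf.placeholder(tf.float32, [None])                # r
done_mask = tf.placeholder(tf.float32, [None])            # 1.0 if the episode ended
importance_weights = tf.placeholder(tf.float32, [None])   # from prioritized replay
q_t_selected = tf.placeholder(tf.float32, [None])         # Q(s, a) for the taken actions
q_tp1 = tf.placeholder(tf.float32, [None, num_actions])   # target network Q'(s', .)

q_tp1_best = tf.reduce_max(q_tp1, axis=1)
td_target = rew_t + gamma * (1.0 - done_mask) * q_tp1_best
td_error = q_t_selected - tf.stop_gradient(td_target)
weighted_error = tf.reduce_mean(importance_weights * huber_loss(td_error))
```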

@@ -388,7 +388,7 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=
q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act
q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")

# target q network evalution
# target q network evaluation
q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func")

2 changes: 1 addition & 1 deletion baselines/deepq/replay_buffer.py
@@ -148,7 +148,7 @@ def sample(self, batch_size, beta):
denoting importance weight of each sampled transition
idxes: np.array
Array of shape (batch_size,) and dtype np.int32
idexes in buffer of sampled experiences
indexes in buffer of sampled experiences
"""
assert beta > 0

2 changes: 1 addition & 1 deletion baselines/gail/adversary.py
@@ -26,7 +26,7 @@ def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="advers
self.num_actions = env.action_space.shape[0]
self.hidden_size = hidden_size
self.build_ph()
# Build grpah
# Build graph
generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False)
expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True)
# Build accuracy
2 changes: 1 addition & 1 deletion baselines/gail/behavior_clone.py
@@ -32,7 +32,7 @@ def argsparser():
parser.add_argument('--traj_limitation', type=int, default=-1)
# Network Configuration (Using MLP Policy)
parser.add_argument('--policy_hidden_size', type=int, default=100)
# for evaluatation
# for evaluation
boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate')
boolean_flag(parser, 'save_sample', default=False, help='save the trajectories or not')
parser.add_argument('--BC_max_iter', help='Max iteration for training BC', type=int, default=1e5)
2 changes: 1 addition & 1 deletion baselines/gail/gail-eval.py
@@ -1,5 +1,5 @@
'''
This code is used to evalaute the imitators trained with different number of trajectories
This code is used to evaluate the imitators trained with different number of trajectories
and plot the results in the same figure for easy comparison.
'''

4 changes: 2 additions & 2 deletions baselines/gail/result/gail-result.md
@@ -24,7 +24,7 @@ Hopper-v1, Walker2d-v1, HalfCheetah-v1, Humanoid-v1, HumanoidStandup-v1. Every i

For details (e.g., adversarial loss, discriminator accuracy, etc.) about GAIL training, please see [here](https://drive.google.com/drive/folders/1nnU8dqAV9i37-_5_vWIspyFUJFQLCsDD?usp=sharing)

### Determinstic Policy (Set std=0)
### Deterministic Policy (Set std=0)
| | Un-normalized | Normalized |
|---|---|---|
| Hopper-v1 | <img src='Hopper-unnormalized-deterministic-scores.png'> | <img src='Hopper-normalized-deterministic-scores.png'> |
@@ -33,7 +33,7 @@ For details (e.g., adversarial loss, discriminator accuracy, etc.) about GAIL tr
| Humanoid-v1 | <img src='Humanoid-unnormalized-deterministic-scores.png'> | <img src='Humanoid-normalized-deterministic-scores.png'> |
| HumanoidStandup-v1 | <img src='HumanoidStandup-unnormalized-deterministic-scores.png'> | <img src='HumanoidStandup-normalized-deterministic-scores.png'> |

### Stochatic Policy
### Stochastic Policy
| | Un-normalized | Normalized |
|---|---|---|
| Hopper-v1 | <img src='Hopper-unnormalized-stochastic-scores.png'> | <img src='Hopper-normalized-stochastic-scores.png'> |
4 changes: 2 additions & 2 deletions baselines/gail/run_mujoco.py
@@ -30,7 +30,7 @@ def argsparser():
parser.add_argument('--load_model_path', help='if provided, load the model', type=str, default=None)
# Task
parser.add_argument('--task', type=str, choices=['train', 'evaluate', 'sample'], default='train')
# for evaluatation
# for evaluation
boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate')
boolean_flag(parser, 'save_sample', default=False, help='save the trajectories or not')
# Mujoco Dataset Configuration
@@ -46,7 +46,7 @@ def argsparser():
parser.add_argument('--max_kl', type=float, default=0.01)
parser.add_argument('--policy_entcoeff', help='entropy coefficiency of policy', type=float, default=0)
parser.add_argument('--adversary_entcoeff', help='entropy coefficiency of discriminator', type=float, default=1e-3)
# Traing Configuration
# Training Configuration
parser.add_argument('--save_per_iter', help='save model every xx iterations', type=int, default=100)
parser.add_argument('--num_timesteps', help='number of timesteps per episode', type=int, default=5e6)
# Behavior Cloning
2 changes: 1 addition & 1 deletion baselines/gail/trpo_mpi.py
@@ -248,7 +248,7 @@ def fisher_vector_product(p):
add_vtarg_and_adv(seg, gamma, lam)
# ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
vpredbefore = seg["vpred"] # predicted value function before udpate
vpredbefore = seg["vpred"] # predicted value function before update
atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate

if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy
10 changes: 5 additions & 5 deletions baselines/her/README.md
@@ -48,22 +48,22 @@ python experiment/data_generation/fetch_data_generation.py
```
This outputs ```data_fetch_random_100.npz``` file which is our data file.

To launch training with demonstrations (more technically, with behaviour cloning loss as an auxilliary loss), run the following
To launch training with demonstrations (more technically, with behaviour cloning loss as an auxiliary loss), run the following
```bash
python -m baselines.run --alg=her --env=FetchPickAndPlace-v1 --num_timesteps=2.5e6 --demo_file=/Path/to/demo_file.npz
```
This will train a DDPG+HER agent on the `FetchPickAndPlace` environment by using previously generated demonstration data.
To inspect what the agent has learned, use the `--play` flag as described above.

#### Configuration
The provided configuration is for training an agent with HER without demonstrations, we need to change a few paramters for the HER algorithm to learn through demonstrations, to do that, set:
The provided configuration is for training an agent with HER without demonstrations, we need to change a few parameters for the HER algorithm to learn through demonstrations, to do that, set:

* bc_loss: 1 - whether or not to use the behavior cloning loss as an auxilliary loss
* bc_loss: 1 - whether or not to use the behavior cloning loss as an auxiliary loss
* q_filter: 1 - whether or not a Q value filter should be used on the Actor outputs
* num_demo: 100 - number of expert demo episodes
* demo_batch_size: 128 - number of samples to be used from the demonstrations buffer, per mpi thread
* prm_loss_weight: 0.001 - Weight corresponding to the primary loss
* aux_loss_weight: 0.0078 - Weight corresponding to the auxilliary loss also called the cloning loss
* aux_loss_weight: 0.0078 - Weight corresponding to the auxiliary loss also called the cloning loss
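One possible way to apply the settings listed above, sketched under the assumption that DEFAULT_PARAMS in baselines/her/experiment/config.py is the dictionary the HER pipeline reads its defaults from:

```python
from baselines.her.experiment.config import DEFAULT_PARAMS

DEFAULT_PARAMS.update({
    'bc_loss': 1,               # use behavior cloning as an auxiliary loss
    'q_filter': 1,              # filter actor outputs through a Q-value check
    'num_demo': 100,            # expert demo episodes
    'demo_batch_size': 128,     # demo samples per MPI thread per batch
    'prm_loss_weight': 0.001,   # primary (DDPG) loss weight
    'aux_loss_weight': 0.0078,  # auxiliary (cloning) loss weight
})
```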

Apart from these changes the reported results also have the following configurational changes:

@@ -80,6 +80,6 @@ Training with demonstrations helps overcome the exploration problem and achieves

<div class="imgcap" align="middle">
<center><img src="../../data/fetchPickAndPlaceContrast.png"></center>
<div class="thecap" align="middle"><b>Training results for Fetch Pick and Place task constrasting between training with and without demonstration data.</b></div>
<div class="thecap" align="middle"><b>Training results for Fetch Pick and Place task contrasting between training with and without demonstration data.</b></div>
</div>
