From c372c0775ca468041f996b239c961fcd56961332 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Tue, 4 Apr 2023 22:20:55 -0400 Subject: [PATCH 01/52] refactor network and red reward model --- ding/reward_model/__init__.py | 1 + ding/reward_model/network.py | 48 ++++++++++++++++ ding/reward_model/red_irl_model.py | 91 +++++++++++++----------------- 3 files changed, 87 insertions(+), 53 deletions(-) create mode 100644 ding/reward_model/network.py diff --git a/ding/reward_model/__init__.py b/ding/reward_model/__init__.py index 4538102861..8ab0dd8903 100644 --- a/ding/reward_model/__init__.py +++ b/ding/reward_model/__init__.py @@ -13,3 +13,4 @@ from .guided_cost_reward_model import GuidedCostRewardModel from .ngu_reward_model import RndNGURewardModel, EpisodicNGURewardModel from .icm_reward_model import ICMRewardModel +from .network import FeatureNetwork, RndNetwork, RedNetwork \ No newline at end of file diff --git a/ding/reward_model/network.py b/ding/reward_model/network.py new file mode 100644 index 0000000000..d50807a157 --- /dev/null +++ b/ding/reward_model/network.py @@ -0,0 +1,48 @@ +from typing import Union, Tuple, List, Dict +from easydict import EasyDict + +import torch +import torch.nn as nn + +from ding.utils import SequenceType, REWARD_MODEL_REGISTRY +from ding.model import FCEncoder, ConvEncoder +from ding.torch_utils.data_helper import to_tensor +import numpy as np + +class FeatureNetwork(nn.Module): + def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType) -> None: + super(FeatureNetwork, self).__init__() + if isinstance(obs_shape, int) or len(obs_shape) == 1: + self.feature = FCEncoder(obs_shape, hidden_size_list) + elif len(obs_shape) == 3: + self.feature = ConvEncoder(obs_shape, hidden_size_list) + else: + raise KeyError( + "not support obs_shape for pre-defined encoder: {}, please customize your own RND model". 
+ format(obs_shape) + ) + + def forward(self, obs: torch.Tensor) -> torch.Tensor: + feature_output = self.feature(obs) + return feature_output + +class RndNetwork(nn.Module): + + def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType) -> None: + super(RndNetwork, self).__init__() + self.target = FeatureNetwork(obs_shape, hidden_size_list) + self.predictor = FeatureNetwork(obs_shape, hidden_size_list) + + for param in self.target.parameters(): + param.requires_grad = False + + def forward(self, obs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + predict_feature = self.predictor(obs) + with torch.no_grad(): + target_feature = self.target(obs) + return predict_feature, target_feature + +class RedNetwork(RndNetwork): + def __init__(self, obs_shape: int, action_shape: int, hidden_size_list: SequenceType) -> None: + # RED network does not support high dimension obs + super().__init__(obs_shape+action_shape, hidden_size_list) diff --git a/ding/reward_model/red_irl_model.py b/ding/reward_model/red_irl_model.py index a7daeeceec..c36d43f2ec 100644 --- a/ding/reward_model/red_irl_model.py +++ b/ding/reward_model/red_irl_model.py @@ -2,29 +2,33 @@ import pickle import random import torch -import torch.nn as nn import torch.optim as optim +import torch.nn.functional as F from ding.utils import REWARD_MODEL_REGISTRY, one_time_warning from .base_reward_model import BaseRewardModel +from .network import RedNetwork -class SENet(nn.Module): - """support estimation network""" - - def __init__(self, input_size: int, hidden_size: int, output_dims: int) -> None: - super(SENet, self).__init__() - self.l_1 = nn.Linear(input_size, hidden_size) - self.l_2 = nn.Linear(hidden_size, output_dims) - self.act = nn.Tanh() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = self.l_1(x) - out = self.act(out) - out = self.l_2(out) - out = self.act(out) - return out - +def concat_state_action_pair(data: list) -> torch.Tensor: + """ + Overview: + 
Concatenate state and action pairs from input. + Arguments: + - data (:obj:`List`): List with at least ``obs`` and ``action`` keys. + Returns: + - res (:obj:`Torch.tensor`): State and action pairs. + """ + states_data = [] + actions_data = [] + for item in data: + states_data.append(item['obs']) + actions_data.append(item['action']) + states_tensor: torch.Tensor = torch.stack(states_data).float() + actions_tensor: torch.Tensor = torch.stack(actions_data).float() + states_actions_tensor: torch.Tensor = torch.cat([states_tensor, actions_tensor], dim=1) + + return states_actions_tensor @REWARD_MODEL_REGISTRY.register('red') class RedRewardModel(BaseRewardModel): @@ -67,6 +71,8 @@ class RedRewardModel(BaseRewardModel): sample_size=1000, # (int) Linear model hidden size. hidden_size=128, + # (list(int)) Sequence of ``hidden_size`` of reward network. + hidden_size_list=[128, 1], # (float) The step size of gradient descent. learning_rate=1e-3, # (int) How many updates(iterations) to train after collector's one collection. 
@@ -99,11 +105,9 @@ def __init__(self, config: Dict, device: str, tb_logger: 'SummaryWriter') -> Non self.device = device assert device in ["cpu", "cuda"] or "cuda" in device self.tb_logger = tb_logger - self.target_net: SENet = SENet(config.input_size, config.hidden_size, 1) - self.online_net: SENet = SENet(config.input_size, config.hidden_size, 1) - self.target_net.to(device) - self.online_net.to(device) - self.opt: optim.Adam = optim.Adam(self.online_net.parameters(), config.learning_rate) + self.reward_model = RedNetwork(config.obs_shape, config.action_shape, config.hidden_size_list) + self.reward_model.to(self.device) + self.opt = optim.Adam(self.reward_model.predictor.parameters(), config.learning_rate) self.train_once_flag = False self.load_expert_data() @@ -121,19 +125,18 @@ def load_expert_data(self) -> None: self.expert_data = random.sample(self.expert_data, sample_size) print('the expert data size is:', len(self.expert_data)) - def _train(self, batch_data: torch.Tensor) -> float: + def _train(self) -> float: """ Overview: Helper function for ``train`` which caclulates loss for train data and expert data. - Arguments: - - batch_data (:obj:`torch.Tensor`): Data used for training Returns: - - Combined loss calculated of reward model from using ``batch_data`` in both target and reward models. + - Combined loss calculated of reward model from using ``states_actions_tensor``. 
""" - with torch.no_grad(): - target = self.target_net(batch_data) - hat: torch.Tensor = self.online_net(batch_data) - loss: torch.Tensor = ((hat - target) ** 2).mean() + sample_batch = random.sample(self.expert_data, self.cfg.batch_size) + states_actions_tensor = concat_state_action_pair(sample_batch) + states_actions_tensor = states_actions_tensor.to(self.device) + predict_feature, target_feature = self.reward_model(states_actions_tensor) + loss = F.mse_loss(predict_feature, target_feature.detach()) self.opt.zero_grad() loss.backward() self.opt.step() @@ -150,17 +153,7 @@ def train(self) -> None: one_time_warning('RED model should be trained once, we do not train it anymore') else: for i in range(self.cfg.update_per_collect): - sample_batch = random.sample(self.expert_data, self.cfg.batch_size) - states_data = [] - actions_data = [] - for item in sample_batch: - states_data.append(item['obs']) - actions_data.append(item['action']) - states_tensor: torch.Tensor = torch.stack(states_data).float() - actions_tensor: torch.Tensor = torch.stack(actions_data).float() - states_actions_tensor: torch.Tensor = torch.cat([states_tensor, actions_tensor], dim=1) - states_actions_tensor = states_actions_tensor.to(self.device) - loss = self._train(states_actions_tensor) + loss = self._train() self.tb_logger.add_scalar('reward_model/red_loss', loss, i) self.train_once_flag = True @@ -177,20 +170,12 @@ def estimate(self, data: list) -> List[Dict]: # NOTE: deepcopy reward part of data is very important, # otherwise the reward of data in the replay buffer will be incorrectly modified. 
train_data_augmented = self.reward_deepcopy(data) - states_data = [] - actions_data = [] - for item in train_data_augmented: - states_data.append(item['obs']) - actions_data.append(item['action']) - states_tensor = torch.stack(states_data).float() - actions_tensor = torch.stack(actions_data).float() - states_actions_tensor = torch.cat([states_tensor, actions_tensor], dim=1) + states_actions_tensor = concat_state_action_pair(train_data_augmented) states_actions_tensor = states_actions_tensor.to(self.device) with torch.no_grad(): - hat_1 = self.online_net(states_actions_tensor) - hat_2 = self.target_net(states_actions_tensor) - c = ((hat_1 - hat_2) ** 2).mean(dim=1) - r = torch.exp(-self.cfg.sigma * c) + predict_feature, target_feature = self.reward_model(states_actions_tensor) + mse = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1) + r = torch.exp(-self.cfg.sigma * mse) for item, rew in zip(train_data_augmented, r): item['reward'] = rew return train_data_augmented From 6718e4a5cd479d007ca8c753cf6938d9b18a9e8e Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Wed, 5 Apr 2023 00:22:18 -0400 Subject: [PATCH 02/52] create reward model utils --- .../tests/test_serial_entry_reward_model.py | 5 +- ding/reward_model/__init__.py | 3 +- ding/reward_model/network.py | 11 ++- ding/reward_model/red_irl_model.py | 70 +++++++------------ ding/reward_model/reword_model_utils.py | 37 ++++++++++ 5 files changed, 76 insertions(+), 50 deletions(-) create mode 100644 ding/reward_model/reword_model_utils.py diff --git a/ding/entry/tests/test_serial_entry_reward_model.py b/ding/entry/tests/test_serial_entry_reward_model.py index 404cb6d78c..ceaf9dc665 100644 --- a/ding/entry/tests/test_serial_entry_reward_model.py +++ b/ding/entry/tests/test_serial_entry_reward_model.py @@ -32,8 +32,9 @@ { 'type': 'red', 'sample_size': 5000, - 'input_size': 5, - 'hidden_size': 64, + 'obs_shape': 4, + 'action_shape': 1, + 'hidden_size_list': [64, 1], 'update_per_collect': 200, 
'batch_size': 128, }, diff --git a/ding/reward_model/__init__.py b/ding/reward_model/__init__.py index 8ab0dd8903..148d791a69 100644 --- a/ding/reward_model/__init__.py +++ b/ding/reward_model/__init__.py @@ -13,4 +13,5 @@ from .guided_cost_reward_model import GuidedCostRewardModel from .ngu_reward_model import RndNGURewardModel, EpisodicNGURewardModel from .icm_reward_model import ICMRewardModel -from .network import FeatureNetwork, RndNetwork, RedNetwork \ No newline at end of file +from .network import FeatureNetwork, RndNetwork, RedNetwork +from .reword_model_utils import concat_state_action_pairs \ No newline at end of file diff --git a/ding/reward_model/network.py b/ding/reward_model/network.py index d50807a157..91562034b2 100644 --- a/ding/reward_model/network.py +++ b/ding/reward_model/network.py @@ -9,7 +9,9 @@ from ding.torch_utils.data_helper import to_tensor import numpy as np + class FeatureNetwork(nn.Module): + def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType) -> None: super(FeatureNetwork, self).__init__() if isinstance(obs_shape, int) or len(obs_shape) == 1: @@ -21,11 +23,12 @@ def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: Sequen "not support obs_shape for pre-defined encoder: {}, please customize your own RND model". 
format(obs_shape) ) - + def forward(self, obs: torch.Tensor) -> torch.Tensor: feature_output = self.feature(obs) return feature_output + class RndNetwork(nn.Module): def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType) -> None: @@ -41,8 +44,10 @@ def forward(self, obs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: with torch.no_grad(): target_feature = self.target(obs) return predict_feature, target_feature - + + class RedNetwork(RndNetwork): + def __init__(self, obs_shape: int, action_shape: int, hidden_size_list: SequenceType) -> None: # RED network does not support high dimension obs - super().__init__(obs_shape+action_shape, hidden_size_list) + super().__init__(obs_shape + action_shape, hidden_size_list) diff --git a/ding/reward_model/red_irl_model.py b/ding/reward_model/red_irl_model.py index c36d43f2ec..62077810bf 100644 --- a/ding/reward_model/red_irl_model.py +++ b/ding/reward_model/red_irl_model.py @@ -1,6 +1,8 @@ from typing import Dict, List import pickle import random +from collections.abc import Iterable + import torch import torch.optim as optim import torch.nn.functional as F @@ -8,28 +10,9 @@ from ding.utils import REWARD_MODEL_REGISTRY, one_time_warning from .base_reward_model import BaseRewardModel from .network import RedNetwork +from .reword_model_utils import concat_state_action_pairs -def concat_state_action_pair(data: list) -> torch.Tensor: - """ - Overview: - Concatenate state and action pairs from input. - Arguments: - - data (:obj:`List`): List with at least ``obs`` and ``action`` keys. - Returns: - - res (:obj:`Torch.tensor`): State and action pairs. 
- """ - states_data = [] - actions_data = [] - for item in data: - states_data.append(item['obs']) - actions_data.append(item['action']) - states_tensor: torch.Tensor = torch.stack(states_data).float() - actions_tensor: torch.Tensor = torch.stack(actions_data).float() - states_actions_tensor: torch.Tensor = torch.cat([states_tensor, actions_tensor], dim=1) - - return states_actions_tensor - @REWARD_MODEL_REGISTRY.register('red') class RedRewardModel(BaseRewardModel): """ @@ -39,38 +22,37 @@ class RedRewardModel(BaseRewardModel): ``estimate``, ``train``, ``load_expert_data``, ``collect_data``, ``clear_date``, \ ``__init__``, ``_train`` Config: - == ================== ===== ============= ======================================= ======================= - ID Symbol Type Default Value Description Other(Shape) - == ================== ===== ============= ======================================= ======================= - 1 ``type`` str red | Reward model register name, refer | - | to registry ``REWARD_MODEL_REGISTRY`` | - 2 | ``expert_data_`` str expert_data | Path to the expert dataset | Should be a '.pkl' - | ``path`` .pkl | | file - 3 | ``sample_size`` int 1000 | sample data from expert dataset | - | with fixed size | - 4 | ``sigma`` int 5 | hyperparameter of r(s,a) | r(s,a) = exp( + == ================== ====== ============= ======================================= ======================= + ID Symbol Type Default Value Description Other(Shape) + == ================== ====== ============= ======================================= ======================= + 1 ``type`` str red | Reward model register name, refer | + | to registry ``REWARD_MODEL_REGISTRY`` | + 2 | ``expert_data_`` str expert_data | Path to the expert dataset | Should be a '.pkl' + | ``path`` .pkl | | file + 3 | ``sample_size`` int 1000 | sample data from expert dataset | + | with fixed size | + 4 | ``sigma`` int 5 | hyperparameter of r(s,a) | r(s,a) = exp( | -sigma* L(s,a)) - 5 | ``batch_size`` int 64 | Training 
batch size | - 6 | ``hidden_size`` int 128 | Linear model hidden size | - 7 | ``update_per_`` int 100 | Number of updates per collect | - | ``collect`` | | - 8 | ``clear_buffer`` int 1 | clear buffer per fixed iters | make sure replay + 5 | ``batch_size`` int 64 | Training batch size | + 6 | ``hidden`` list [64, 64, | Sequence of ``hidden_size`` | + | ``_size_list`` (int) 128] | of reward network | + 7 | ``update_per_`` int 100 | Number of updates per collect | + | ``collect`` | | + 8 | ``clear_buffer`` int 1 | clear buffer per fixed iters | make sure replay ``_per_iters`` | buffer's data count | isn't too few. | (code work in entry) - == ================== ===== ============= ======================================= ======================= - Properties: - - online_net (:obj: `SENet`): The reward model, in default initialized once as the training begins. + == ================== ====== ============= ======================================= ======================= """ config = dict( # (str) Reward model register name, refer to registry ``REWARD_MODEL_REGISTRY``. type='red', - # (int) Linear model input size. - # input_size=4, + # (int) observation shape + # obs_shape=4, + # (int) action shape + # action_shape=1, # (int) Sample data from expert dataset with fixed size. sample_size=1000, - # (int) Linear model hidden size. - hidden_size=128, # (list(int)) Sequence of ``hidden_size`` of reward network. hidden_size_list=[128, 1], # (float) The step size of gradient descent. @@ -133,7 +115,7 @@ def _train(self) -> float: - Combined loss calculated of reward model from using ``states_actions_tensor``. 
""" sample_batch = random.sample(self.expert_data, self.cfg.batch_size) - states_actions_tensor = concat_state_action_pair(sample_batch) + states_actions_tensor = concat_state_action_pairs(sample_batch) states_actions_tensor = states_actions_tensor.to(self.device) predict_feature, target_feature = self.reward_model(states_actions_tensor) loss = F.mse_loss(predict_feature, target_feature.detach()) @@ -170,7 +152,7 @@ def estimate(self, data: list) -> List[Dict]: # NOTE: deepcopy reward part of data is very important, # otherwise the reward of data in the replay buffer will be incorrectly modified. train_data_augmented = self.reward_deepcopy(data) - states_actions_tensor = concat_state_action_pair(train_data_augmented) + states_actions_tensor = concat_state_action_pairs(train_data_augmented) states_actions_tensor = states_actions_tensor.to(self.device) with torch.no_grad(): predict_feature, target_feature = self.reward_model(states_actions_tensor) diff --git a/ding/reward_model/reword_model_utils.py b/ding/reward_model/reword_model_utils.py new file mode 100644 index 0000000000..f3ace38d94 --- /dev/null +++ b/ding/reward_model/reword_model_utils.py @@ -0,0 +1,37 @@ +from typing import Union, Optional, List, Any, Tuple +from collections.abc import Iterable + +import torch +import torch.optim as optim +import torch.nn.functional as F + + +def concat_state_action_pairs( + data: list, action_size: Optional[int] = None, one_hot: Optional[bool] = False +) -> torch.Tensor: + """ + Overview: + Concatenate state and action pairs from input. + Arguments: + - data (:obj:`List`): List with at least ``obs`` and ``action`` keys. + Returns: + - state_actions_tensor (:obj:`Torch.tensor`): State and action pairs. 
+ """ + states_data = [] + actions_data = [] + #check data(dict) has key obs and action + assert isinstance(data, Iterable) + assert "obs" in data[0] and "action" in data[0] + for item in data: + states_data.append(item['obs'].flatten()) # to allow 3d obs and actions concatenation + if one_hot and action_size: + action = torch.Tensor([int(i == item['action']) for i in range(action_size)]) + actions_data.append(action) + else: + actions_data.append(item['action']) + + states_tensor: torch.Tensor = torch.stack(states_data).float() + actions_tensor: torch.Tensor = torch.stack(actions_data).float() + states_actions_tensor: torch.Tensor = torch.cat([states_tensor, actions_tensor], dim=1) + + return states_actions_tensor From be7039ab9498255c5a9dbf8a4b360a098460ae3b Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Thu, 6 Apr 2023 02:14:15 -0400 Subject: [PATCH 03/52] polish network and reward model utils, provide test for them --- ding/reward_model/__init__.py | 4 +-- ding/reward_model/network.py | 21 +++++++----- ding/reward_model/red_irl_model.py | 5 +++ ding/reward_model/reword_model_utils.py | 17 ++++++---- .../tests/test_reward_model_network.py | 25 ++++++++++++++ .../tests/test_reward_model_utils.py | 34 +++++++++++++++++++ 6 files changed, 90 insertions(+), 16 deletions(-) create mode 100644 ding/reward_model/tests/test_reward_model_network.py create mode 100644 ding/reward_model/tests/test_reward_model_utils.py diff --git a/ding/reward_model/__init__.py b/ding/reward_model/__init__.py index 148d791a69..3c29cbfa57 100644 --- a/ding/reward_model/__init__.py +++ b/ding/reward_model/__init__.py @@ -13,5 +13,5 @@ from .guided_cost_reward_model import GuidedCostRewardModel from .ngu_reward_model import RndNGURewardModel, EpisodicNGURewardModel from .icm_reward_model import ICMRewardModel -from .network import FeatureNetwork, RndNetwork, RedNetwork -from .reword_model_utils import concat_state_action_pairs \ No newline at end of file +from .network import 
RepresentationNetwork, RndNetwork, RedNetwork +from .reword_model_utils import concat_state_action_pairs diff --git a/ding/reward_model/network.py b/ding/reward_model/network.py index 91562034b2..73b3865d28 100644 --- a/ding/reward_model/network.py +++ b/ding/reward_model/network.py @@ -1,4 +1,4 @@ -from typing import Union, Tuple, List, Dict +from typing import Union, Tuple, List, Dict, Optional from easydict import EasyDict import torch @@ -10,14 +10,19 @@ import numpy as np -class FeatureNetwork(nn.Module): +class RepresentationNetwork(nn.Module): - def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType) -> None: - super(FeatureNetwork, self).__init__() + def __init__( + self, + obs_shape: Union[int, SequenceType], + hidden_size_list: SequenceType, + activation: Optional[nn.Module] = nn.ReLU() + ) -> None: + super(RepresentationNetwork, self).__init__() if isinstance(obs_shape, int) or len(obs_shape) == 1: - self.feature = FCEncoder(obs_shape, hidden_size_list) + self.feature = FCEncoder(obs_shape, hidden_size_list, activation=activation) elif len(obs_shape) == 3: - self.feature = ConvEncoder(obs_shape, hidden_size_list) + self.feature = ConvEncoder(obs_shape, hidden_size_list, activation=activation) else: raise KeyError( "not support obs_shape for pre-defined encoder: {}, please customize your own RND model". 
@@ -33,8 +38,8 @@ class RndNetwork(nn.Module): def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType) -> None: super(RndNetwork, self).__init__() - self.target = FeatureNetwork(obs_shape, hidden_size_list) - self.predictor = FeatureNetwork(obs_shape, hidden_size_list) + self.target = RepresentationNetwork(obs_shape, hidden_size_list) + self.predictor = RepresentationNetwork(obs_shape, hidden_size_list) for param in self.target.parameters(): param.requires_grad = False diff --git a/ding/reward_model/red_irl_model.py b/ding/reward_model/red_irl_model.py index 62077810bf..9aaf839c67 100644 --- a/ding/reward_model/red_irl_model.py +++ b/ding/reward_model/red_irl_model.py @@ -10,6 +10,7 @@ from ding.utils import REWARD_MODEL_REGISTRY, one_time_warning from .base_reward_model import BaseRewardModel from .network import RedNetwork +from .gail_irl_model import concat_state_action_pairs_one_hot from .reword_model_utils import concat_state_action_pairs @@ -116,6 +117,10 @@ def _train(self) -> float: """ sample_batch = random.sample(self.expert_data, self.cfg.batch_size) states_actions_tensor = concat_state_action_pairs(sample_batch) + states_actions_tensor_one = concat_state_action_pairs(sample_batch, action_size=5, one_hot_=True) + states_actions_tensor_one_hot = concat_state_action_pairs_one_hot(sample_batch, 5) + states_actions_tensor_one_hot = torch.stack(states_actions_tensor_one_hot) + assert states_actions_tensor_one.equal(states_actions_tensor_one_hot) states_actions_tensor = states_actions_tensor.to(self.device) predict_feature, target_feature = self.reward_model(states_actions_tensor) loss = F.mse_loss(predict_feature, target_feature.detach()) diff --git a/ding/reward_model/reword_model_utils.py b/ding/reward_model/reword_model_utils.py index f3ace38d94..d89cfb651e 100644 --- a/ding/reward_model/reword_model_utils.py +++ b/ding/reward_model/reword_model_utils.py @@ -5,9 +5,11 @@ import torch.optim as optim import torch.nn.functional 
as F +from ding.torch_utils import one_hot + def concat_state_action_pairs( - data: list, action_size: Optional[int] = None, one_hot: Optional[bool] = False + data: list, action_size: Optional[int] = None, one_hot_: Optional[bool] = False ) -> torch.Tensor: """ Overview: @@ -20,12 +22,15 @@ def concat_state_action_pairs( states_data = [] actions_data = [] #check data(dict) has key obs and action - assert isinstance(data, Iterable) - assert "obs" in data[0] and "action" in data[0] + assert isinstance(data, Iterable), "data should be Iterable" + assert "obs" in data[0] and "action" in data[0], "data member must contain key 'obs' and 'action' " for item in data: - states_data.append(item['obs'].flatten()) # to allow 3d obs and actions concatenation - if one_hot and action_size: - action = torch.Tensor([int(i == item['action']) for i in range(action_size)]) + states_data.append(item['obs'].flatten()) + if one_hot_ and action_size: + new_action = torch.Tensor([int(i == item['action']) for i in range(action_size)]) + print(new_action.shape) + action = one_hot(torch.Tensor(item['action']), action_size).squeeze(dim=0) + print(action.shape) actions_data.append(action) else: actions_data.append(item['action']) diff --git a/ding/reward_model/tests/test_reward_model_network.py b/ding/reward_model/tests/test_reward_model_network.py new file mode 100644 index 0000000000..dbc06c7ef0 --- /dev/null +++ b/ding/reward_model/tests/test_reward_model_network.py @@ -0,0 +1,25 @@ +from collections.abc import Iterable + +import torch +import pytest +import torch.optim as optim +import torch.nn.functional as F + +from ding.reward_model.network import RepresentationNetwork, RndNetwork, RedNetwork + + +@pytest.mark.unittest +def test_representation_network(): + # len(obs_shape) == 3 + obs_shape = [4, 84, 84] + batch_size = 32 + hidden_size_list = [16, 16, 16, 16] + reward_model = RepresentationNetwork(obs_shape, hidden_size_list) + data = torch.randn([batch_size] + obs_shape) + data_feature = 
reward_model(data) + assert data_feature.shape == (batch_size, hidden_size_list[-1]) + + # len(obs_shape) == 4 + with pytest.raises(KeyError): + obs_shape = [4, 84, 84, 5] + reward_model = RepresentationNetwork(obs_shape, hidden_size_list) diff --git a/ding/reward_model/tests/test_reward_model_utils.py b/ding/reward_model/tests/test_reward_model_utils.py new file mode 100644 index 0000000000..7b445fb527 --- /dev/null +++ b/ding/reward_model/tests/test_reward_model_utils.py @@ -0,0 +1,34 @@ +import pytest +import torch + +from ding.reward_model.reword_model_utils import concat_state_action_pairs + + +@pytest.mark.unittest +def test_concat_state_action_pairs(): + data = [{'obs': torch.rand(3), 'action': torch.randint(0, 4, size=(1, ))} for i in range(10)] + states_actions_tensor = concat_state_action_pairs(data) + states_actions_test = [] + for item in data: + state = item['obs'].flatten() + action = item['action'] + s_a = torch.cat([state, action.float()], dim=-1) + states_actions_test.append(s_a) + states_actions_tensor_test = torch.stack(states_actions_test) + assert states_actions_tensor.equal(states_actions_tensor_test) + + +@pytest.mark.unittest +def test_concat_state_action_pairs_one_hot(): + data = [{'obs': torch.rand(3), 'action': torch.randint(0, 4, size=(1, ))} for i in range(10)] + action_size = 5 + states_actions_tensor = concat_state_action_pairs(data, action_size, True) + states_actions_test = [] + for item in data: + state = item['obs'].flatten() + action = item['action'] + action = torch.Tensor([int(i == action) for i in range(action_size)]) + s_a = torch.cat([state, action], dim=-1) + states_actions_test.append(s_a) + states_actions_tensor_test = torch.stack(states_actions_test) + assert states_actions_tensor.equal(states_actions_tensor_test) From a4de46659a0c7a2cd412d0e7676b36d886a5b335 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Mon, 10 Apr 2023 01:36:23 -0400 Subject: [PATCH 04/52] refactor network for two method: learn and forward --- 
ding/reward_model/network.py | 33 ++++++++++++++++++++++++++---- ding/reward_model/red_irl_model.py | 15 ++++---------- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/ding/reward_model/network.py b/ding/reward_model/network.py index 73b3865d28..0b74820b09 100644 --- a/ding/reward_model/network.py +++ b/ding/reward_model/network.py @@ -3,6 +3,7 @@ import torch import torch.nn as nn +import torch.nn.functional as F from ding.utils import SequenceType, REWARD_MODEL_REGISTRY from ding.model import FCEncoder, ConvEncoder @@ -25,7 +26,7 @@ def __init__( self.feature = ConvEncoder(obs_shape, hidden_size_list, activation=activation) else: raise KeyError( - "not support obs_shape for pre-defined encoder: {}, please customize your own RND model". + "not support obs_shape for pre-defined encoder: {}, please customize your own Representation Network". format(obs_shape) ) @@ -44,15 +45,39 @@ def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: Sequen for param in self.target.parameters(): param.requires_grad = False - def forward(self, obs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + def forward(self, obs: torch.Tensor) -> torch.Tensor: + with torch.no_grad(): + predict_feature = self.predictor(obs) + target_feature = self.target(obs) + reward = F.mse_loss(predict_feature, target_feature.detach()) + reward = (reward - reward.min()) / (reward.max() - reward.min() + 1e-8) + return reward + + def learn(self, obs: torch.Tensor) -> torch.Tensor: predict_feature = self.predictor(obs) with torch.no_grad(): target_feature = self.target(obs) - return predict_feature, target_feature + loss = F.mse_loss(predict_feature, target_feature.detach()) + return loss class RedNetwork(RndNetwork): - def __init__(self, obs_shape: int, action_shape: int, hidden_size_list: SequenceType) -> None: + def __init__( + self, + obs_shape: int, + action_shape: int, + hidden_size_list: SequenceType, + sigma: Optional[float] = 0.5 + ) -> None: # RED network does 
not support high dimension obs super().__init__(obs_shape + action_shape, hidden_size_list) + self.sigma = sigma + + def forward(self, obs: torch.Tensor) -> torch.Tensor: + with torch.no_grad(): + predict_feature = self.predictor(obs) + target_feature = self.target(obs) + reward = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1) + reward = torch.exp(-self.sigma * reward) + return reward diff --git a/ding/reward_model/red_irl_model.py b/ding/reward_model/red_irl_model.py index 9aaf839c67..992812111a 100644 --- a/ding/reward_model/red_irl_model.py +++ b/ding/reward_model/red_irl_model.py @@ -88,7 +88,7 @@ def __init__(self, config: Dict, device: str, tb_logger: 'SummaryWriter') -> Non self.device = device assert device in ["cpu", "cuda"] or "cuda" in device self.tb_logger = tb_logger - self.reward_model = RedNetwork(config.obs_shape, config.action_shape, config.hidden_size_list) + self.reward_model = RedNetwork(config.obs_shape, config.action_shape, config.hidden_size_list, config.sigma) self.reward_model.to(self.device) self.opt = optim.Adam(self.reward_model.predictor.parameters(), config.learning_rate) self.train_once_flag = False @@ -117,13 +117,9 @@ def _train(self) -> float: """ sample_batch = random.sample(self.expert_data, self.cfg.batch_size) states_actions_tensor = concat_state_action_pairs(sample_batch) - states_actions_tensor_one = concat_state_action_pairs(sample_batch, action_size=5, one_hot_=True) - states_actions_tensor_one_hot = concat_state_action_pairs_one_hot(sample_batch, 5) - states_actions_tensor_one_hot = torch.stack(states_actions_tensor_one_hot) - assert states_actions_tensor_one.equal(states_actions_tensor_one_hot) states_actions_tensor = states_actions_tensor.to(self.device) - predict_feature, target_feature = self.reward_model(states_actions_tensor) - loss = F.mse_loss(predict_feature, target_feature.detach()) + loss = self.reward_model.learn(states_actions_tensor) + # loss = F.mse_loss(predict_feature, 
target_feature.detach()) self.opt.zero_grad() loss.backward() self.opt.step() @@ -159,10 +155,7 @@ def estimate(self, data: list) -> List[Dict]: train_data_augmented = self.reward_deepcopy(data) states_actions_tensor = concat_state_action_pairs(train_data_augmented) states_actions_tensor = states_actions_tensor.to(self.device) - with torch.no_grad(): - predict_feature, target_feature = self.reward_model(states_actions_tensor) - mse = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1) - r = torch.exp(-self.cfg.sigma * mse) + r = self.reward_model.forward(states_actions_tensor) for item, rew in zip(train_data_augmented, r): item['reward'] = rew return train_data_augmented From 7a8ec6eae46784c85966fd167f900b134e4fc9c3 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Mon, 10 Apr 2023 23:04:00 -0400 Subject: [PATCH 05/52] refactor rnd --- ding/reward_model/network.py | 2 +- ding/reward_model/reword_model_utils.py | 43 ++++++++++++ ding/reward_model/rnd_reward_model.py | 89 ++++--------------------- 3 files changed, 58 insertions(+), 76 deletions(-) diff --git a/ding/reward_model/network.py b/ding/reward_model/network.py index 0b74820b09..b2695bc8cc 100644 --- a/ding/reward_model/network.py +++ b/ding/reward_model/network.py @@ -49,7 +49,7 @@ def forward(self, obs: torch.Tensor) -> torch.Tensor: with torch.no_grad(): predict_feature = self.predictor(obs) target_feature = self.target(obs) - reward = F.mse_loss(predict_feature, target_feature.detach()) + reward = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1) reward = (reward - reward.min()) / (reward.max() - reward.min() + 1e-8) return reward diff --git a/ding/reward_model/reword_model_utils.py b/ding/reward_model/reword_model_utils.py index d89cfb651e..585bb21f34 100644 --- a/ding/reward_model/reword_model_utils.py +++ b/ding/reward_model/reword_model_utils.py @@ -1,11 +1,14 @@ from typing import Union, Optional, List, Any, Tuple from collections.abc import Iterable +from 
easydict import EasyDict import torch import torch.optim as optim import torch.nn.functional as F from ding.torch_utils import one_hot +from ding.utils import RunningMeanStd +from ding.torch_utils.data_helper import to_tensor def concat_state_action_pairs( @@ -40,3 +43,43 @@ def concat_state_action_pairs( states_actions_tensor: torch.Tensor = torch.cat([states_tensor, actions_tensor], dim=1) return states_actions_tensor + + +def combine_intrinsic_exterinsic_reward( + train_data_augmented: Any, rnd_reward: List[torch.Tensor], config: EasyDict +) -> Any: + for item, rnd_rew in zip(train_data_augmented, rnd_reward): + if config.intrinsic_reward_type == 'add': + if config.extrinsic_reward_norm: + item['reward' + ] = item['reward'] / config.extrinsic_reward_norm_max + rnd_rew * config.intrinsic_reward_weight + else: + item['reward'] = item['reward'] + rnd_rew * config.intrinsic_reward_weight + elif config.intrinsic_reward_type == 'new': + item['intrinsic_reward'] = rnd_rew + if config.extrinsic_reward_norm: + item['reward'] = item['reward'] / config.extrinsic_reward_norm_max + elif config.intrinsic_reward_type == 'assign': + item['reward'] = rnd_rew + + return item + + +def collect_states(iterator) -> List: + res = [] + for item in iterator: + state = item['obs'] + res.append(state) + return res + + +def obs_norm( + train_data: torch.Tensor, running_mean_std_rnd_obs: RunningMeanStd, config: EasyDict, device: str +) -> torch.Tensor: + # Note: observation normalization: transform obs to mean 0, std 1, move norm obs to specific device + running_mean_std_rnd_obs.update(train_data.cpu().numpy()) + train_data = (train_data - to_tensor(running_mean_std_rnd_obs.mean + ).to(device)) / to_tensor(running_mean_std_rnd_obs.std).to(device) + train_data = torch.clamp(train_data, min=config.obs_norm_clamp_min, max=config.obs_norm_clamp_max) + + return train_data diff --git a/ding/reward_model/rnd_reward_model.py b/ding/reward_model/rnd_reward_model.py index 00bb1542fd..7f7f320ad1 
100644 --- a/ding/reward_model/rnd_reward_model.py +++ b/ding/reward_model/rnd_reward_model.py @@ -1,53 +1,19 @@ -from typing import Union, Tuple, List, Dict +from typing import List, Dict from easydict import EasyDict import random import torch -import torch.nn as nn import torch.optim as optim -import torch.nn.functional as F -from ding.utils import SequenceType, REWARD_MODEL_REGISTRY -from ding.model import FCEncoder, ConvEncoder +from ding.utils import REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel +from .reword_model_utils import combine_intrinsic_exterinsic_reward, collect_states, obs_norm +from .network import RndNetwork from ding.utils import RunningMeanStd from ding.torch_utils.data_helper import to_tensor import numpy as np -def collect_states(iterator): - res = [] - for item in iterator: - state = item['obs'] - res.append(state) - return res - - -class RndNetwork(nn.Module): - - def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType) -> None: - super(RndNetwork, self).__init__() - if isinstance(obs_shape, int) or len(obs_shape) == 1: - self.target = FCEncoder(obs_shape, hidden_size_list) - self.predictor = FCEncoder(obs_shape, hidden_size_list) - elif len(obs_shape) == 3: - self.target = ConvEncoder(obs_shape, hidden_size_list) - self.predictor = ConvEncoder(obs_shape, hidden_size_list) - else: - raise KeyError( - "not support obs_shape for pre-defined encoder: {}, please customize your own RND model". 
- format(obs_shape) - ) - for param in self.target.parameters(): - param.requires_grad = False - - def forward(self, obs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - predict_feature = self.predictor(obs) - with torch.no_grad(): - target_feature = self.target(obs) - return predict_feature, target_feature - - @REWARD_MODEL_REGISTRY.register('rnd') class RndRewardModel(BaseRewardModel): """ @@ -132,32 +98,26 @@ def __init__(self, config: EasyDict, device: str = 'cpu', tb_logger: 'SummaryWri assert self.intrinsic_reward_type in ['add', 'new', 'assign'] self.train_obs = [] self.opt = optim.Adam(self.reward_model.predictor.parameters(), config.learning_rate) - self._running_mean_std_rnd_reward = RunningMeanStd(epsilon=1e-4) self.estimate_cnt_rnd = 0 self.train_cnt_icm = 0 self._running_mean_std_rnd_obs = RunningMeanStd(epsilon=1e-4) - def _train(self) -> None: + def _train(self) -> torch.Tensor: train_data: list = random.sample(self.train_obs, self.cfg.batch_size) train_data: torch.Tensor = torch.stack(train_data).to(self.device) if self.cfg.obs_norm: - # Note: observation normalization: transform obs to mean 0, std 1 - self._running_mean_std_rnd_obs.update(train_data.cpu().numpy()) - train_data = (train_data - to_tensor(self._running_mean_std_rnd_obs.mean).to(self.device)) / to_tensor( - self._running_mean_std_rnd_obs.std - ).to(self.device) - train_data = torch.clamp(train_data, min=self.cfg.obs_norm_clamp_min, max=self.cfg.obs_norm_clamp_max) - - predict_feature, target_feature = self.reward_model(train_data) - loss = F.mse_loss(predict_feature, target_feature.detach()) - self.tb_logger.add_scalar('rnd_reward/loss', loss, self.train_cnt_icm) + train_data = obs_norm(train_data, self._running_mean_std_rnd_obs, self.cfg, self.device) + loss = self.reward_model.learn(train_data) self.opt.zero_grad() loss.backward() self.opt.step() + return loss + def train(self) -> None: for _ in range(self.cfg.update_per_collect): - self._train() + loss = self._train() + 
self.tb_logger.add_scalar('rnd_reward/loss', loss, self.train_cnt_icm) self.train_cnt_icm += 1 def estimate(self, data: list) -> List[Dict]: @@ -171,19 +131,10 @@ def estimate(self, data: list) -> List[Dict]: obs = collect_states(train_data_augmented) obs = torch.stack(obs).to(self.device) if self.cfg.obs_norm: - # Note: observation normalization: transform obs to mean 0, std 1 - obs = (obs - to_tensor(self._running_mean_std_rnd_obs.mean - ).to(self.device)) / to_tensor(self._running_mean_std_rnd_obs.std).to(self.device) - obs = torch.clamp(obs, min=self.cfg.obs_norm_clamp_min, max=self.cfg.obs_norm_clamp_max) + obs = obs_norm(obs, self._running_mean_std_rnd_obs, self.cfg, self.device) with torch.no_grad(): - predict_feature, target_feature = self.reward_model(obs) - mse = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1) - self._running_mean_std_rnd_reward.update(mse.cpu().numpy()) - - # Note: according to the min-max normalization, transform rnd reward to [0,1] - rnd_reward = (mse - mse.min()) / (mse.max() - mse.min() + 1e-8) - + rnd_reward = self.reward_model.forward(obs) # save the rnd_reward statistics into tb_logger self.estimate_cnt_rnd += 1 self.tb_logger.add_scalar('rnd_reward/rnd_reward_max', rnd_reward.max(), self.estimate_cnt_rnd) @@ -200,19 +151,7 @@ def estimate(self, data: list) -> List[Dict]: # rewards = torch.stack([data[i]['reward'] for i in range(len(data))]) # rewards = (rewards - torch.min(rewards)) / (torch.max(rewards) - torch.min(rewards)) - for item, rnd_rew in zip(train_data_augmented, rnd_reward): - if self.intrinsic_reward_type == 'add': - if self.cfg.extrinsic_reward_norm: - item['reward'] = item[ - 'reward'] / self.cfg.extrinsic_reward_norm_max + rnd_rew * self.cfg.intrinsic_reward_weight - else: - item['reward'] = item['reward'] + rnd_rew * self.cfg.intrinsic_reward_weight - elif self.intrinsic_reward_type == 'new': - item['intrinsic_reward'] = rnd_rew - if self.cfg.extrinsic_reward_norm: - item['reward'] = 
item['reward'] / self.cfg.extrinsic_reward_norm_max - elif self.intrinsic_reward_type == 'assign': - item['reward'] = rnd_rew + item = combine_intrinsic_exterinsic_reward(train_data_augmented, rnd_reward, self.cfg) # save the augmented_reward statistics into tb_logger rew = [item['reward'].cpu().numpy() for item in train_data_augmented] From 55c7be81c5141073e354133103665dbef501a0fd Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Tue, 11 Apr 2023 15:35:27 -0400 Subject: [PATCH 06/52] refactor gail --- ding/reward_model/gail_irl_model.py | 137 +++++------------------- ding/reward_model/network.py | 70 ++++++++++-- ding/reward_model/red_irl_model.py | 1 - ding/reward_model/reword_model_utils.py | 3 - ding/reward_model/trex_reward_model.py | 2 +- 5 files changed, 89 insertions(+), 124 deletions(-) diff --git a/ding/reward_model/gail_irl_model.py b/ding/reward_model/gail_irl_model.py index 6533e114dd..f650c6645d 100644 --- a/ding/reward_model/gail_irl_model.py +++ b/ding/reward_model/gail_irl_model.py @@ -10,99 +10,12 @@ from ding.utils import REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel +from .reword_model_utils import concat_state_action_pairs +from .network import GailNetwork import torch.nn.functional as F from functools import partial -def concat_state_action_pairs(iterator): - """ - Overview: - Concatenate state and action pairs from input. - Arguments: - - iterator (:obj:`Iterable`): Iterables with at least ``obs`` and ``action`` tensor keys. - Returns: - - res (:obj:`Torch.tensor`): State and action pairs. - """ - assert isinstance(iterator, Iterable) - res = [] - for item in iterator: - state = item['obs'].flatten() # to allow 3d obs and actions concatenation - action = item['action'] - s_a = torch.cat([state, action.float()], dim=-1) - res.append(s_a) - return res - - -def concat_state_action_pairs_one_hot(iterator, action_size: int): - """ - Overview: - Concatenate state and action pairs from input. 
Action values are one-hot encoded - Arguments: - - iterator (:obj:`Iterable`): Iterables with at least ``obs`` and ``action`` tensor keys. - Returns: - - res (:obj:`Torch.tensor`): State and action pairs. - """ - assert isinstance(iterator, Iterable) - res = [] - for item in iterator: - state = item['obs'].flatten() # to allow 3d obs and actions concatenation - action = item['action'] - action = torch.Tensor([int(i == action) for i in range(action_size)]) - s_a = torch.cat([state, action], dim=-1) - res.append(s_a) - return res - - -class RewardModelNetwork(nn.Module): - - def __init__(self, input_size: int, hidden_size: int, output_size: int) -> None: - super(RewardModelNetwork, self).__init__() - self.l1 = nn.Linear(input_size, hidden_size) - self.l2 = nn.Linear(hidden_size, output_size) - self.a1 = nn.Tanh() - self.a2 = nn.Sigmoid() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - out = x - out = self.l1(out) - out = self.a1(out) - out = self.l2(out) - out = self.a2(out) - return out - - -class AtariRewardModelNetwork(nn.Module): - - def __init__(self, input_size: int, action_size: int) -> None: - super(AtariRewardModelNetwork, self).__init__() - self.input_size = input_size - self.action_size = action_size - self.conv1 = nn.Conv2d(4, 16, 7, stride=3) - self.conv2 = nn.Conv2d(16, 16, 5, stride=2) - self.conv3 = nn.Conv2d(16, 16, 3, stride=1) - self.conv4 = nn.Conv2d(16, 16, 3, stride=1) - self.fc1 = nn.Linear(784, 64) - self.fc2 = nn.Linear(64 + self.action_size, 1) # here we add 1 to take consideration of the action concat - self.a = nn.Sigmoid() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - # input: x = [B, 4 x 84 x 84 + self.action_size], last element is action - actions = x[:, -self.action_size:] # [B, self.action_size] - # get observations - x = x[:, :-self.action_size] - x = x.reshape([-1] + self.input_size) # [B, 4, 84, 84] - x = F.leaky_relu(self.conv1(x)) - x = F.leaky_relu(self.conv2(x)) - x = F.leaky_relu(self.conv3(x)) - x = 
F.leaky_relu(self.conv4(x)) - x = x.reshape(-1, 784) - x = F.leaky_relu(self.fc1(x)) - x = torch.cat([x, actions], dim=-1) - x = self.fc2(x) - r = self.a(x) - return r - - @REWARD_MODEL_REGISTRY.register('gail') class GailRewardModel(BaseRewardModel): """ @@ -175,12 +88,14 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> self.tb_logger = tb_logger obs_shape = config.input_size if isinstance(obs_shape, int) or len(obs_shape) == 1: - self.reward_model = RewardModelNetwork(config.input_size, config.hidden_size, 1) + self.reward_model = GailNetwork(obs_shape, [config.hidden_size], nn.Tanh()) self.concat_state_action_pairs = concat_state_action_pairs elif len(obs_shape) == 3: action_shape = self.cfg.action_size - self.reward_model = AtariRewardModelNetwork(config.input_size, action_shape) - self.concat_state_action_pairs = partial(concat_state_action_pairs_one_hot, action_size=action_shape) + self.reward_model = GailNetwork( + obs_shape, [16, 16, 16, 16, 64], [7, 5, 3, 3], [3, 2, 2, 1], nn.LeakyReLU(), action_shape + ) + self.concat_state_action_pairs = partial(concat_state_action_pairs, action_size=action_shape, one_hot_=True) self.reward_model.to(self.device) self.expert_data = [] self.train_data = [] @@ -201,6 +116,7 @@ def load_expert_data(self) -> None: with open(self.cfg.data_path + '/expert_data.pkl', 'rb') as f: self.expert_data_loader: list = pickle.load(f) self.expert_data = self.concat_state_action_pairs(self.expert_data_loader) + self.expert_data = torch.unbind(self.expert_data, dim=0) def state_dict(self) -> Dict[str, Any]: return { @@ -210,23 +126,20 @@ def state_dict(self) -> Dict[str, Any]: def load_state_dict(self, state_dict: Dict[str, Any]) -> None: self.reward_model.load_state_dict(state_dict['model']) - def learn(self, train_data: torch.Tensor, expert_data: torch.Tensor) -> float: + def _train(self) -> float: """ Overview: - Helper function for ``train`` which calculates loss for train data and expert data. 
- Arguments: - - train_data (:obj:`torch.Tensor`): Data used for training - - expert_data (:obj:`torch.Tensor`): Expert data + Helper function for ``train`` which caclulates loss for train data and expert data. Returns: - - Combined loss calculated of reward model from using ``train_data`` and ``expert_data``. + - Combined loss calculated of reward model from using ``states_actions_tensor``. """ - # calculate loss, here are some hyper-param - out_1: torch.Tensor = self.reward_model(train_data) - loss_1: torch.Tensor = torch.log(out_1 + 1e-8).mean() - out_2: torch.Tensor = self.reward_model(expert_data) - loss_2: torch.Tensor = torch.log(1 - out_2 + 1e-8).mean() - # log(x) with 0 None: - This is a side effect function which updates the reward model and increment the train iteration count. """ for _ in range(self.cfg.update_per_collect): - sample_expert_data: list = random.sample(self.expert_data, self.cfg.batch_size) - sample_train_data: list = random.sample(self.train_data, self.cfg.batch_size) - sample_expert_data = torch.stack(sample_expert_data).to(self.device) - sample_train_data = torch.stack(sample_train_data).to(self.device) - loss = self.learn(sample_train_data, sample_expert_data) + loss = self._train() self.tb_logger.add_scalar('reward_model/gail_loss', loss, self.train_iter) self.train_iter += 1 @@ -264,9 +173,9 @@ def estimate(self, data: list) -> List[Dict]: # otherwise the reward of data in the replay buffer will be incorrectly modified. 
train_data_augmented = self.reward_deepcopy(data) res = self.concat_state_action_pairs(train_data_augmented) - res = torch.stack(res).to(self.device) + res = res.to(self.device) with torch.no_grad(): - reward = self.reward_model(res).squeeze(-1).cpu() + reward = self.reward_model.forward(res).squeeze(-1).cpu() reward = torch.chunk(reward, reward.shape[0], dim=0) for item, rew in zip(train_data_augmented, reward): item['reward'] = -torch.log(rew + 1e-8) @@ -282,7 +191,9 @@ def collect_data(self, data: list) -> None: Effects: - This is a side effect function which updates the data attribute in ``self`` """ - self.train_data.extend(self.concat_state_action_pairs(data)) + data = self.concat_state_action_pairs(data) + data = torch.unbind(data, dim=0) + self.train_data.extend(data) def clear_data(self) -> None: """ diff --git a/ding/reward_model/network.py b/ding/reward_model/network.py index b2695bc8cc..53513df870 100644 --- a/ding/reward_model/network.py +++ b/ding/reward_model/network.py @@ -8,22 +8,27 @@ from ding.utils import SequenceType, REWARD_MODEL_REGISTRY from ding.model import FCEncoder, ConvEncoder from ding.torch_utils.data_helper import to_tensor -import numpy as np +from .reword_model_utils import concat_state_action_pairs +from functools import partial class RepresentationNetwork(nn.Module): def __init__( - self, - obs_shape: Union[int, SequenceType], - hidden_size_list: SequenceType, - activation: Optional[nn.Module] = nn.ReLU() + self, + obs_shape: Union[int, SequenceType], + hidden_size_list: SequenceType, + activation: Optional[nn.Module] = nn.ReLU(), + kernel_size: Optional[SequenceType] = [8, 4, 3], + stride: Optional[SequenceType] = [4, 2, 1], ) -> None: super(RepresentationNetwork, self).__init__() if isinstance(obs_shape, int) or len(obs_shape) == 1: self.feature = FCEncoder(obs_shape, hidden_size_list, activation=activation) elif len(obs_shape) == 3: - self.feature = ConvEncoder(obs_shape, hidden_size_list, activation=activation) + self.feature 
= ConvEncoder( + obs_shape, hidden_size_list, activation=activation, kernel_size=kernel_size, stride=stride + ) else: raise KeyError( "not support obs_shape for pre-defined encoder: {}, please customize your own Representation Network". @@ -81,3 +86,56 @@ def forward(self, obs: torch.Tensor) -> torch.Tensor: reward = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1) reward = torch.exp(-self.sigma * reward) return reward + + +class GailNetwork(nn.Module): + + def __init__( + self, + obs_shape: Union[int, SequenceType], + hidden_size_list: SequenceType, + kernel_size: Optional[SequenceType] = [8, 4, 3], + stride: Optional[SequenceType] = [4, 2, 1], + activation: Optional[nn.Module] = nn.ReLU(), + action_shape: Optional[int] = None + ) -> None: + super(GailNetwork, self).__init__() + # Gail will need one more fc layer after RepresentationNetwork, and it will use another activation function + self.act = nn.Sigmoid() + if isinstance(obs_shape, int) or len(obs_shape) == 1: + self.feature = RepresentationNetwork(obs_shape, hidden_size_list, activation) + self.concat_state_action_pairs = concat_state_action_pairs + self.fc = nn.Linear(hidden_size_list[0], 1) + self.image_input = False + elif len(obs_shape) == 3: + self.feature = RepresentationNetwork(obs_shape, hidden_size_list, activation, kernel_size, stride) + self.fc = nn.Linear(64 + self.action_size, 1) + self.image_input = True + self.action_size = action_shape + self.obs_size = obs_shape + + def learn(self, train_data: torch.Tensor, expert_data: torch.Tensor) -> torch.Tensor: + out_1: torch.Tensor = self.forward(train_data) + loss_1: torch.Tensor = torch.log(out_1 + 1e-8).mean() + out_2: torch.Tensor = self.forward(expert_data) + loss_2: torch.Tensor = torch.log(1 - out_2 + 1e-8).mean() + + loss: torch.Tensor = -(loss_1 + loss_2) + + return loss + + def forward(self, data: torch.Tensor) -> torch.Tensor: + if self.image_input: + # input: x = [B, 4 x 84 x 84 + self.action_size], last element is 
action + actions = data[:, -self.action_size:] # [B, self.action_size] + # get observations + obs = data[:, :-self.action_size] + obs = obs.reshape([-1] + self.obs_size) # [B, 4, 84, 84] + obs = self.feature(obs) + data = torch.cat([obs, actions], dim=-1) + else: + data = self.feature(data) + + data = self.fc(data) + reward = self.act(data) + return reward diff --git a/ding/reward_model/red_irl_model.py b/ding/reward_model/red_irl_model.py index 992812111a..eb2d79a637 100644 --- a/ding/reward_model/red_irl_model.py +++ b/ding/reward_model/red_irl_model.py @@ -10,7 +10,6 @@ from ding.utils import REWARD_MODEL_REGISTRY, one_time_warning from .base_reward_model import BaseRewardModel from .network import RedNetwork -from .gail_irl_model import concat_state_action_pairs_one_hot from .reword_model_utils import concat_state_action_pairs diff --git a/ding/reward_model/reword_model_utils.py b/ding/reward_model/reword_model_utils.py index 585bb21f34..02a71a3fb0 100644 --- a/ding/reward_model/reword_model_utils.py +++ b/ding/reward_model/reword_model_utils.py @@ -30,10 +30,7 @@ def concat_state_action_pairs( for item in data: states_data.append(item['obs'].flatten()) if one_hot_ and action_size: - new_action = torch.Tensor([int(i == item['action']) for i in range(action_size)]) - print(new_action.shape) action = one_hot(torch.Tensor(item['action']), action_size).squeeze(dim=0) - print(action.shape) actions_data.append(action) else: actions_data.append(item['action']) diff --git a/ding/reward_model/trex_reward_model.py b/ding/reward_model/trex_reward_model.py index 635dc5e75e..6fe349b5f5 100644 --- a/ding/reward_model/trex_reward_model.py +++ b/ding/reward_model/trex_reward_model.py @@ -16,7 +16,7 @@ from ding.utils.data import default_collate from .base_reward_model import BaseRewardModel -from .rnd_reward_model import collect_states +from .reword_model_utils import collect_states class TrexConvEncoder(nn.Module): From ff60716dd15809765525bf7848ed1c1ab7bc6081 Mon Sep 17 
00:00:00 2001 From: Ruoyu Gao Date: Tue, 11 Apr 2023 17:04:33 -0400 Subject: [PATCH 07/52] fix gail for unit test --- ding/reward_model/gail_irl_model.py | 6 ++++-- ding/reward_model/network.py | 4 ++-- ding/reward_model/reword_model_utils.py | 2 +- ding/reward_model/rnd_reward_model.py | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/ding/reward_model/gail_irl_model.py b/ding/reward_model/gail_irl_model.py index f650c6645d..1989858686 100644 --- a/ding/reward_model/gail_irl_model.py +++ b/ding/reward_model/gail_irl_model.py @@ -60,8 +60,10 @@ class GailRewardModel(BaseRewardModel): update_per_collect=100, # (int) How many samples in a training batch. batch_size=64, - # (int) Size of the input: obs_dim + act_dim. + # (int) Size of the input: obs_dim. input_size=4, + # Size of the input: act_dim. + # action_size=6, # (int) Collect steps per iteration. target_new_data_count=64, # (int) Linear model hidden size. @@ -93,7 +95,7 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> elif len(obs_shape) == 3: action_shape = self.cfg.action_size self.reward_model = GailNetwork( - obs_shape, [16, 16, 16, 16, 64], [7, 5, 3, 3], [3, 2, 2, 1], nn.LeakyReLU(), action_shape + obs_shape, [16, 16, 16, 16, 64], [7, 5, 3, 3], [3, 2, 2, 1], nn.LeakyReLU(), action_shape=action_shape ) self.concat_state_action_pairs = partial(concat_state_action_pairs, action_size=action_shape, one_hot_=True) self.reward_model.to(self.device) diff --git a/ding/reward_model/network.py b/ding/reward_model/network.py index 53513df870..82186b26f2 100644 --- a/ding/reward_model/network.py +++ b/ding/reward_model/network.py @@ -108,11 +108,11 @@ def __init__( self.fc = nn.Linear(hidden_size_list[0], 1) self.image_input = False elif len(obs_shape) == 3: + self.action_size = action_shape + self.obs_size = obs_shape self.feature = RepresentationNetwork(obs_shape, hidden_size_list, activation, kernel_size, stride) self.fc = nn.Linear(64 + self.action_size, 1) 
self.image_input = True - self.action_size = action_shape - self.obs_size = obs_shape def learn(self, train_data: torch.Tensor, expert_data: torch.Tensor) -> torch.Tensor: out_1: torch.Tensor = self.forward(train_data) diff --git a/ding/reward_model/reword_model_utils.py b/ding/reward_model/reword_model_utils.py index 02a71a3fb0..276d60e459 100644 --- a/ding/reward_model/reword_model_utils.py +++ b/ding/reward_model/reword_model_utils.py @@ -30,7 +30,7 @@ def concat_state_action_pairs( for item in data: states_data.append(item['obs'].flatten()) if one_hot_ and action_size: - action = one_hot(torch.Tensor(item['action']), action_size).squeeze(dim=0) + action = one_hot(torch.Tensor(item['action']).long(), action_size).squeeze(dim=0) actions_data.append(action) else: actions_data.append(item['action']) diff --git a/ding/reward_model/rnd_reward_model.py b/ding/reward_model/rnd_reward_model.py index 7f7f320ad1..1450db54a3 100644 --- a/ding/reward_model/rnd_reward_model.py +++ b/ding/reward_model/rnd_reward_model.py @@ -133,8 +133,8 @@ def estimate(self, data: list) -> List[Dict]: if self.cfg.obs_norm: obs = obs_norm(obs, self._running_mean_std_rnd_obs, self.cfg, self.device) + rnd_reward = self.reward_model.forward(obs) with torch.no_grad(): - rnd_reward = self.reward_model.forward(obs) # save the rnd_reward statistics into tb_logger self.estimate_cnt_rnd += 1 self.tb_logger.add_scalar('rnd_reward/rnd_reward_max', rnd_reward.max(), self.estimate_cnt_rnd) From 6b80392a5f0a3a1f4162c26aab320a2e4b9ad2f9 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Tue, 11 Apr 2023 22:22:28 -0400 Subject: [PATCH 08/52] refactor icm --- ding/reward_model/__init__.py | 4 +- ding/reward_model/gail_irl_model.py | 6 +- ding/reward_model/icm_reward_model.py | 153 ++---------------- ding/reward_model/network.py | 118 +++++++++++++- ding/reward_model/red_irl_model.py | 4 +- ding/reward_model/reword_model_utils.py | 14 +- ding/reward_model/rnd_reward_model.py | 6 +- 
.../tests/test_reward_model_network.py | 2 +- .../tests/test_reward_model_utils.py | 29 +++- 9 files changed, 175 insertions(+), 161 deletions(-) diff --git a/ding/reward_model/__init__.py b/ding/reward_model/__init__.py index 3c29cbfa57..dd668e0118 100644 --- a/ding/reward_model/__init__.py +++ b/ding/reward_model/__init__.py @@ -13,5 +13,5 @@ from .guided_cost_reward_model import GuidedCostRewardModel from .ngu_reward_model import RndNGURewardModel, EpisodicNGURewardModel from .icm_reward_model import ICMRewardModel -from .network import RepresentationNetwork, RndNetwork, RedNetwork -from .reword_model_utils import concat_state_action_pairs +from .network import RepresentationNetwork, RNDNetwork, REDNetwork, GAILNetwork +from .reword_model_utils import concat_state_action_pairs, combine_intrinsic_exterinsic_reward, obs_norm, collect_states diff --git a/ding/reward_model/gail_irl_model.py b/ding/reward_model/gail_irl_model.py index 1989858686..b93b711be4 100644 --- a/ding/reward_model/gail_irl_model.py +++ b/ding/reward_model/gail_irl_model.py @@ -11,7 +11,7 @@ from ding.utils import REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel from .reword_model_utils import concat_state_action_pairs -from .network import GailNetwork +from .network import GAILNetwork import torch.nn.functional as F from functools import partial @@ -90,11 +90,11 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> self.tb_logger = tb_logger obs_shape = config.input_size if isinstance(obs_shape, int) or len(obs_shape) == 1: - self.reward_model = GailNetwork(obs_shape, [config.hidden_size], nn.Tanh()) + self.reward_model = GAILNetwork(obs_shape, [config.hidden_size], nn.Tanh()) self.concat_state_action_pairs = concat_state_action_pairs elif len(obs_shape) == 3: action_shape = self.cfg.action_size - self.reward_model = GailNetwork( + self.reward_model = GAILNetwork( obs_shape, [16, 16, 16, 16, 64], [7, 5, 3, 3], [3, 2, 2, 1], nn.LeakyReLU(), 
action_shape=action_shape ) self.concat_state_action_pairs = partial(concat_state_action_pairs, action_size=action_shape, one_hot_=True) diff --git a/ding/reward_model/icm_reward_model.py b/ding/reward_model/icm_reward_model.py index 9cc6e23e9b..89d76daa3b 100644 --- a/ding/reward_model/icm_reward_model.py +++ b/ding/reward_model/icm_reward_model.py @@ -6,10 +6,10 @@ import torch.nn as nn import torch.optim as optim -from ding.utils import SequenceType, REWARD_MODEL_REGISTRY -from ding.model import FCEncoder, ConvEncoder -from ding.torch_utils import one_hot +from ding.utils import REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel +from .network import ICMNetwork +from .reword_model_utils import combine_intrinsic_exterinsic_reward def collect_states(iterator: list) -> Tuple[list, list, list]: @@ -26,102 +26,6 @@ def collect_states(iterator: list) -> Tuple[list, list, list]: return states, next_states, actions -class ICMNetwork(nn.Module): - """ - Intrinsic Curiosity Model (ICM Module) - Implementation of: - [1] Curiosity-driven Exploration by Self-supervised Prediction - Pathak, Agrawal, Efros, and Darrell - UC Berkeley - ICML 2017. 
- https://arxiv.org/pdf/1705.05363.pdf - [2] Code implementation reference: - https://github.com/pathak22/noreward-rl - https://github.com/jcwleo/curiosity-driven-exploration-pytorch - - 1) Embedding observations into a latent space - 2) Predicting the action logit given two consecutive embedded observations - 3) Predicting the next embedded obs, given the embeded former observation and action - """ - - def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType, action_shape: int) -> None: - super(ICMNetwork, self).__init__() - if isinstance(obs_shape, int) or len(obs_shape) == 1: - self.feature = FCEncoder(obs_shape, hidden_size_list) - elif len(obs_shape) == 3: - self.feature = ConvEncoder(obs_shape, hidden_size_list) - else: - raise KeyError( - "not support obs_shape for pre-defined encoder: {}, please customize your own ICM model". - format(obs_shape) - ) - self.action_shape = action_shape - feature_output = hidden_size_list[-1] - self.inverse_net = nn.Sequential(nn.Linear(feature_output * 2, 512), nn.ReLU(), nn.Linear(512, action_shape)) - self.residual = nn.ModuleList( - [ - nn.Sequential( - nn.Linear(action_shape + 512, 512), - nn.LeakyReLU(), - nn.Linear(512, 512), - ) for _ in range(8) - ] - ) - self.forward_net_1 = nn.Sequential(nn.Linear(action_shape + feature_output, 512), nn.LeakyReLU()) - self.forward_net_2 = nn.Linear(action_shape + 512, feature_output) - - def forward(self, state: torch.Tensor, next_state: torch.Tensor, - action_long: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - r""" - Overview: - Use observation, next_observation and action to genearte ICM module - Parameter updates with ICMNetwork forward setup. - Arguments: - - state (:obj:`torch.Tensor`): - The current state batch - - next_state (:obj:`torch.Tensor`): - The next state batch - - action_long (:obj:`torch.Tensor`): - The action batch - Returns: - - real_next_state_feature (:obj:`torch.Tensor`): - Run with the encoder. 
Return the real next_state's embedded feature. - - pred_next_state_feature (:obj:`torch.Tensor`): - Run with the encoder and residual network. Return the predicted next_state's embedded feature. - - pred_action_logit (:obj:`torch.Tensor`): - Run with the encoder. Return the predicted action logit. - Shapes: - - state (:obj:`torch.Tensor`): :math:`(B, N)`, where B is the batch size and N is ''obs_shape'' - - next_state (:obj:`torch.Tensor`): :math:`(B, N)`, where B is the batch size and N is ''obs_shape'' - - action_long (:obj:`torch.Tensor`): :math:`(B)`, where B is the batch size'' - - real_next_state_feature (:obj:`torch.Tensor`): :math:`(B, M)`, where B is the batch size - and M is embedded feature size - - pred_next_state_feature (:obj:`torch.Tensor`): :math:`(B, M)`, where B is the batch size - and M is embedded feature size - - pred_action_logit (:obj:`torch.Tensor`): :math:`(B, A)`, where B is the batch size - and A is the ''action_shape'' - """ - action = one_hot(action_long, num=self.action_shape) - encode_state = self.feature(state) - encode_next_state = self.feature(next_state) - # get pred action logit - concat_state = torch.cat((encode_state, encode_next_state), 1) - pred_action_logit = self.inverse_net(concat_state) - # --------------------- - - # get pred next state - pred_next_state_feature_orig = torch.cat((encode_state, action), 1) - pred_next_state_feature_orig = self.forward_net_1(pred_next_state_feature_orig) - - # residual - for i in range(4): - pred_next_state_feature = self.residual[i * 2](torch.cat((pred_next_state_feature_orig, action), 1)) - pred_next_state_feature_orig = self.residual[i * 2 + 1]( - torch.cat((pred_next_state_feature, action), 1) - ) + pred_next_state_feature_orig - pred_next_state_feature = self.forward_net_2(torch.cat((pred_next_state_feature_orig, action), 1)) - real_next_state_feature = encode_next_state - return real_next_state_feature, pred_next_state_feature, pred_action_logit - - 
@REWARD_MODEL_REGISTRY.register('icm') class ICMRewardModel(BaseRewardModel): """ @@ -211,15 +115,11 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> self.train_next_states = [] self.train_actions = [] self.opt = optim.Adam(self.reward_model.parameters(), config.learning_rate) - self.ce = nn.CrossEntropyLoss(reduction="mean") - self.forward_mse = nn.MSELoss(reduction='none') self.reverse_scale = config.reverse_scale - self.res = nn.Softmax(dim=-1) self.estimate_cnt_icm = 0 self.train_cnt_icm = 0 - def _train(self) -> None: - self.train_cnt_icm += 1 + def _train(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, float]: train_data_list = [i for i in range(0, len(self.train_states))] train_data_index = random.sample(train_data_list, self.cfg.batch_size) data_states: list = [self.train_states[i] for i in train_data_index] @@ -229,26 +129,23 @@ def _train(self) -> None: data_actions: list = [self.train_actions[i] for i in train_data_index] data_actions: torch.Tensor = torch.cat(data_actions).to(self.device) - real_next_state_feature, pred_next_state_feature, pred_action_logit = self.reward_model( - data_states, data_next_states, data_actions - ) - inverse_loss = self.ce(pred_action_logit, data_actions.long()) - forward_loss = self.forward_mse(pred_next_state_feature, real_next_state_feature.detach()).mean() - self.tb_logger.add_scalar('icm_reward/forward_loss', forward_loss, self.train_cnt_icm) - self.tb_logger.add_scalar('icm_reward/inverse_loss', inverse_loss, self.train_cnt_icm) - action = torch.argmax(self.res(pred_action_logit), -1) - accuracy = torch.sum(action == data_actions.squeeze(-1)).item() / data_actions.shape[0] - self.tb_logger.add_scalar('icm_reward/action_accuracy', accuracy, self.train_cnt_icm) - loss = self.reverse_scale * inverse_loss + forward_loss - self.tb_logger.add_scalar('icm_reward/total_loss', loss, self.train_cnt_icm) + inverse_loss, forward_loss, accuracy = self.reward_model.learn(data_states, 
data_next_states, data_actions) loss = self.reverse_scale * inverse_loss + forward_loss + self.opt.zero_grad() loss.backward() self.opt.step() + return loss, inverse_loss, forward_loss, accuracy + def train(self) -> None: for _ in range(self.cfg.update_per_collect): - self._train() + loss, inverse_loss, forward_loss, accuracy = self._train() + self.tb_logger.add_scalar('icm_reward/total_loss', loss, self.train_cnt_icm) + self.tb_logger.add_scalar('icm_reward/forward_loss', forward_loss, self.train_cnt_icm) + self.tb_logger.add_scalar('icm_reward/inverse_loss', inverse_loss, self.train_cnt_icm) + self.tb_logger.add_scalar('icm_reward/action_accuracy', accuracy, self.train_cnt_icm) + self.train_cnt_icm += 1 def estimate(self, data: list) -> List[Dict]: # NOTE: deepcopy reward part of data is very important, @@ -259,33 +156,15 @@ def estimate(self, data: list) -> List[Dict]: next_states = torch.stack(next_states).to(self.device) actions = torch.cat(actions).to(self.device) with torch.no_grad(): - real_next_state_feature, pred_next_state_feature, _ = self.reward_model(states, next_states, actions) - raw_icm_reward = self.forward_mse(real_next_state_feature, pred_next_state_feature).mean(dim=1) + raw_icm_reward = self.reward_model.forward(states, next_states, actions) self.estimate_cnt_icm += 1 self.tb_logger.add_scalar('icm_reward/raw_icm_reward_max', raw_icm_reward.max(), self.estimate_cnt_icm) self.tb_logger.add_scalar('icm_reward/raw_icm_reward_mean', raw_icm_reward.mean(), self.estimate_cnt_icm) self.tb_logger.add_scalar('icm_reward/raw_icm_reward_min', raw_icm_reward.min(), self.estimate_cnt_icm) self.tb_logger.add_scalar('icm_reward/raw_icm_reward_std', raw_icm_reward.std(), self.estimate_cnt_icm) icm_reward = (raw_icm_reward - raw_icm_reward.min()) / (raw_icm_reward.max() - raw_icm_reward.min() + 1e-8) - self.tb_logger.add_scalar('icm_reward/icm_reward_max', icm_reward.max(), self.estimate_cnt_icm) - self.tb_logger.add_scalar('icm_reward/icm_reward_mean', 
icm_reward.mean(), self.estimate_cnt_icm) - self.tb_logger.add_scalar('icm_reward/icm_reward_min', icm_reward.min(), self.estimate_cnt_icm) - self.tb_logger.add_scalar('icm_reward/icm_reward_std', icm_reward.std(), self.estimate_cnt_icm) - icm_reward = (raw_icm_reward - raw_icm_reward.min()) / (raw_icm_reward.max() - raw_icm_reward.min() + 1e-8) icm_reward = icm_reward.to(self.device) - for item, icm_rew in zip(train_data_augmented, icm_reward): - if self.intrinsic_reward_type == 'add': - if self.cfg.extrinsic_reward_norm: - item['reward'] = item[ - 'reward'] / self.cfg.extrinsic_reward_norm_max + icm_rew * self.cfg.intrinsic_reward_weight - else: - item['reward'] = item['reward'] + icm_rew * self.cfg.intrinsic_reward_weight - elif self.intrinsic_reward_type == 'new': - item['intrinsic_reward'] = icm_rew - if self.cfg.extrinsic_reward_norm: - item['reward'] = item['reward'] / self.cfg.extrinsic_reward_norm_max - elif self.intrinsic_reward_type == 'assign': - item['reward'] = icm_rew + train_data_augmented = combine_intrinsic_exterinsic_reward(train_data_augmented, icm_reward, self.cfg) return train_data_augmented diff --git a/ding/reward_model/network.py b/ding/reward_model/network.py index 82186b26f2..8c5ea846ac 100644 --- a/ding/reward_model/network.py +++ b/ding/reward_model/network.py @@ -8,6 +8,7 @@ from ding.utils import SequenceType, REWARD_MODEL_REGISTRY from ding.model import FCEncoder, ConvEncoder from ding.torch_utils.data_helper import to_tensor +from ding.torch_utils import one_hot from .reword_model_utils import concat_state_action_pairs from functools import partial @@ -40,10 +41,10 @@ def forward(self, obs: torch.Tensor) -> torch.Tensor: return feature_output -class RndNetwork(nn.Module): +class RNDNetwork(nn.Module): def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType) -> None: - super(RndNetwork, self).__init__() + super(RNDNetwork, self).__init__() self.target = RepresentationNetwork(obs_shape, hidden_size_list) 
self.predictor = RepresentationNetwork(obs_shape, hidden_size_list) @@ -66,7 +67,7 @@ def learn(self, obs: torch.Tensor) -> torch.Tensor: return loss -class RedNetwork(RndNetwork): +class REDNetwork(RNDNetwork): def __init__( self, @@ -88,7 +89,7 @@ def forward(self, obs: torch.Tensor) -> torch.Tensor: return reward -class GailNetwork(nn.Module): +class GAILNetwork(nn.Module): def __init__( self, @@ -99,7 +100,7 @@ def __init__( activation: Optional[nn.Module] = nn.ReLU(), action_shape: Optional[int] = None ) -> None: - super(GailNetwork, self).__init__() + super(GAILNetwork, self).__init__() # Gail will need one more fc layer after RepresentationNetwork, and it will use another activation function self.act = nn.Sigmoid() if isinstance(obs_shape, int) or len(obs_shape) == 1: @@ -139,3 +140,110 @@ def forward(self, data: torch.Tensor) -> torch.Tensor: data = self.fc(data) reward = self.act(data) return reward + + +class ICMNetwork(nn.Module): + """ + Intrinsic Curiosity Model (ICM Module) + Implementation of: + [1] Curiosity-driven Exploration by Self-supervised Prediction + Pathak, Agrawal, Efros, and Darrell - UC Berkeley - ICML 2017. 
+ https://arxiv.org/pdf/1705.05363.pdf + [2] Code implementation reference: + https://github.com/pathak22/noreward-rl + https://github.com/jcwleo/curiosity-driven-exploration-pytorch + + 1) Embedding observations into a latent space + 2) Predicting the action logit given two consecutive embedded observations + 3) Predicting the next embedded obs, given the embeded former observation and action + """ + + def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType, action_shape: int) -> None: + super(ICMNetwork, self).__init__() + self.action_shape = action_shape + feature_output = hidden_size_list[-1] + self.feature = RepresentationNetwork(obs_shape, hidden_size_list) + self.inverse_net = nn.Sequential(nn.Linear(feature_output * 2, 512), nn.ReLU(), nn.Linear(512, action_shape)) + self.residual = nn.ModuleList( + [ + nn.Sequential( + nn.Linear(action_shape + 512, 512), + nn.LeakyReLU(), + nn.Linear(512, 512), + ) for _ in range(8) + ] + ) + self.forward_net_1 = RepresentationNetwork(action_shape + feature_output, [512], nn.LeakyReLU()) + self.forward_net_2 = nn.Linear(action_shape + 512, feature_output) + + def _forward(self, state: torch.Tensor, next_state: torch.Tensor, + action_long: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Overview: + Use observation, next_observation and action to genearte ICM module + Parameter updates with ICMNetwork forward setup. + Arguments: + - state (:obj:`torch.Tensor`): + The current state batch + - next_state (:obj:`torch.Tensor`): + The next state batch + - action_long (:obj:`torch.Tensor`): + The action batch + Returns: + - real_next_state_feature (:obj:`torch.Tensor`): + Run with the encoder. Return the real next_state's embedded feature. + - pred_next_state_feature (:obj:`torch.Tensor`): + Run with the encoder and residual network. Return the predicted next_state's embedded feature. + - pred_action_logit (:obj:`torch.Tensor`): + Run with the encoder. 
Return the predicted action logit. + Shapes: + - state (:obj:`torch.Tensor`): :math:`(B, N)`, where B is the batch size and N is ''obs_shape'' + - next_state (:obj:`torch.Tensor`): :math:`(B, N)`, where B is the batch size and N is ''obs_shape'' + - action_long (:obj:`torch.Tensor`): :math:`(B)`, where B is the batch size'' + - real_next_state_feature (:obj:`torch.Tensor`): :math:`(B, M)`, where B is the batch size + and M is embedded feature size + - pred_next_state_feature (:obj:`torch.Tensor`): :math:`(B, M)`, where B is the batch size + and M is embedded feature size + - pred_action_logit (:obj:`torch.Tensor`): :math:`(B, A)`, where B is the batch size + and A is the ''action_shape'' + """ + action = one_hot(action_long, num=self.action_shape) + encode_state = self.feature(state) + encode_next_state = self.feature(next_state) + # get pred action logit + concat_state = torch.cat((encode_state, encode_next_state), 1) + pred_action_logit = self.inverse_net(concat_state) + # --------------------- + + # get pred next state + pred_next_state_feature_orig = torch.cat((encode_state, action), 1) + pred_next_state_feature_orig = self.forward_net_1(pred_next_state_feature_orig) + + # residual + for i in range(4): + pred_next_state_feature = self.residual[i * 2](torch.cat((pred_next_state_feature_orig, action), 1)) + pred_next_state_feature_orig = self.residual[i * 2 + 1]( + torch.cat((pred_next_state_feature, action), 1) + ) + pred_next_state_feature_orig + pred_next_state_feature = self.forward_net_2(torch.cat((pred_next_state_feature_orig, action), 1)) + real_next_state_feature = encode_next_state + return real_next_state_feature, pred_next_state_feature, pred_action_logit + + def learn(self, state: torch.Tensor, next_state: torch.Tensor, + action_long: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, float]: + real_next_state_feature, pred_next_state_feature, pred_action_logit = self._forward( + state, next_state, action_long + ) + + inverse_loss = 
F.cross_entropy(pred_action_logit, action_long.long()) + forward_loss = F.mse_loss(pred_next_state_feature, real_next_state_feature.detach()).mean() + action = torch.argmax(F.softmax(pred_action_logit), -1) + accuracy = torch.sum(action == action_long.squeeze(-1)).item() / action_long.shape[0] + return inverse_loss, forward_loss, accuracy + + def forward(self, state: torch.Tensor, next_state: torch.Tensor, action_long: torch.Tensor) -> torch.Tensor: + with torch.no_grad(): + real_next_state_feature, pred_next_state_feature, _ = self._forward(state, next_state, action_long) + reward = F.mse_loss(real_next_state_feature, pred_next_state_feature, reduction="none").mean(dim=1) + + return reward diff --git a/ding/reward_model/red_irl_model.py b/ding/reward_model/red_irl_model.py index eb2d79a637..bcc816af60 100644 --- a/ding/reward_model/red_irl_model.py +++ b/ding/reward_model/red_irl_model.py @@ -9,7 +9,7 @@ from ding.utils import REWARD_MODEL_REGISTRY, one_time_warning from .base_reward_model import BaseRewardModel -from .network import RedNetwork +from .network import REDNetwork from .reword_model_utils import concat_state_action_pairs @@ -87,7 +87,7 @@ def __init__(self, config: Dict, device: str, tb_logger: 'SummaryWriter') -> Non self.device = device assert device in ["cpu", "cuda"] or "cuda" in device self.tb_logger = tb_logger - self.reward_model = RedNetwork(config.obs_shape, config.action_shape, config.hidden_size_list, config.sigma) + self.reward_model = REDNetwork(config.obs_shape, config.action_shape, config.hidden_size_list, config.sigma) self.reward_model.to(self.device) self.opt = optim.Adam(self.reward_model.predictor.parameters(), config.learning_rate) self.train_once_flag = False diff --git a/ding/reward_model/reword_model_utils.py b/ding/reward_model/reword_model_utils.py index 276d60e459..dced2ab0b8 100644 --- a/ding/reward_model/reword_model_utils.py +++ b/ding/reward_model/reword_model_utils.py @@ -43,23 +43,23 @@ def concat_state_action_pairs( 
def combine_intrinsic_exterinsic_reward( - train_data_augmented: Any, rnd_reward: List[torch.Tensor], config: EasyDict + train_data_augmented: Any, intrinsic_reward: List[torch.Tensor], config: EasyDict ) -> Any: - for item, rnd_rew in zip(train_data_augmented, rnd_reward): + for item, in_rew in zip(train_data_augmented, intrinsic_reward): if config.intrinsic_reward_type == 'add': if config.extrinsic_reward_norm: item['reward' - ] = item['reward'] / config.extrinsic_reward_norm_max + rnd_rew * config.intrinsic_reward_weight + ] = item['reward'] / config.extrinsic_reward_norm_max + in_rew * config.intrinsic_reward_weight else: - item['reward'] = item['reward'] + rnd_rew * config.intrinsic_reward_weight + item['reward'] = item['reward'] + in_rew * config.intrinsic_reward_weight elif config.intrinsic_reward_type == 'new': - item['intrinsic_reward'] = rnd_rew + item['intrinsic_reward'] = in_rew if config.extrinsic_reward_norm: item['reward'] = item['reward'] / config.extrinsic_reward_norm_max elif config.intrinsic_reward_type == 'assign': - item['reward'] = rnd_rew + item['reward'] = in_rew - return item + return train_data_augmented def collect_states(iterator) -> List: diff --git a/ding/reward_model/rnd_reward_model.py b/ding/reward_model/rnd_reward_model.py index 1450db54a3..4588b42bcf 100644 --- a/ding/reward_model/rnd_reward_model.py +++ b/ding/reward_model/rnd_reward_model.py @@ -8,7 +8,7 @@ from ding.utils import REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel from .reword_model_utils import combine_intrinsic_exterinsic_reward, collect_states, obs_norm -from .network import RndNetwork +from .network import RNDNetwork from ding.utils import RunningMeanStd from ding.torch_utils.data_helper import to_tensor import numpy as np @@ -92,7 +92,7 @@ def __init__(self, config: EasyDict, device: str = 'cpu', tb_logger: 'SummaryWri from tensorboardX import SummaryWriter tb_logger = SummaryWriter('rnd_reward_model') self.tb_logger = tb_logger - 
self.reward_model = RndNetwork(config.obs_shape, config.hidden_size_list) + self.reward_model = RNDNetwork(config.obs_shape, config.hidden_size_list) self.reward_model.to(self.device) self.intrinsic_reward_type = config.intrinsic_reward_type assert self.intrinsic_reward_type in ['add', 'new', 'assign'] @@ -151,7 +151,7 @@ def estimate(self, data: list) -> List[Dict]: # rewards = torch.stack([data[i]['reward'] for i in range(len(data))]) # rewards = (rewards - torch.min(rewards)) / (torch.max(rewards) - torch.min(rewards)) - item = combine_intrinsic_exterinsic_reward(train_data_augmented, rnd_reward, self.cfg) + train_data_augmented = combine_intrinsic_exterinsic_reward(train_data_augmented, rnd_reward, self.cfg) # save the augmented_reward statistics into tb_logger rew = [item['reward'].cpu().numpy() for item in train_data_augmented] diff --git a/ding/reward_model/tests/test_reward_model_network.py b/ding/reward_model/tests/test_reward_model_network.py index dbc06c7ef0..c1622dcadf 100644 --- a/ding/reward_model/tests/test_reward_model_network.py +++ b/ding/reward_model/tests/test_reward_model_network.py @@ -5,7 +5,7 @@ import torch.optim as optim import torch.nn.functional as F -from ding.reward_model.network import RepresentationNetwork, RndNetwork, RedNetwork +from ding.reward_model.network import RepresentationNetwork @pytest.mark.unittest diff --git a/ding/reward_model/tests/test_reward_model_utils.py b/ding/reward_model/tests/test_reward_model_utils.py index 7b445fb527..93233cb23b 100644 --- a/ding/reward_model/tests/test_reward_model_utils.py +++ b/ding/reward_model/tests/test_reward_model_utils.py @@ -1,7 +1,8 @@ import pytest import torch -from ding.reward_model.reword_model_utils import concat_state_action_pairs +from easydict import EasyDict +from ding.reward_model.reword_model_utils import concat_state_action_pairs, combine_intrinsic_exterinsic_reward @pytest.mark.unittest @@ -32,3 +33,29 @@ def test_concat_state_action_pairs_one_hot(): 
states_actions_test.append(s_a) states_actions_tensor_test = torch.stack(states_actions_test) assert states_actions_tensor.equal(states_actions_tensor_test) + + +@pytest.mark.unittest +def test_combine_intrinsic_exterinsic_reward(): + intrinsic_reward = torch.rand(1) + train_data_augument = [{'obs': torch.rand(5), 'reward': torch.rand(1), 'intrinsic_reward': torch.rand(1)}] + config_list = [ + { + 'intrinsic_reward_type': 'new', + 'intrinsic_reward_weight': 1, + 'extrinsic_reward_norm_max': 1, + 'extrinsic_reward_norm': True, + }, { + 'intrinsic_reward_type': 'assign', + 'intrinsic_reward_weight': 1, + 'extrinsic_reward_norm_max': 1, + 'extrinsic_reward_norm': True, + } + ] + for config in config_list: + config = EasyDict(config) + item = combine_intrinsic_exterinsic_reward(train_data_augument, intrinsic_reward, config) + if config.intrinsic_reward_type == 'new': + assert item['intrinsic_reward'] == intrinsic_reward + elif config.intrinsic_reward_type == 'assign': + assert item['reward'] == intrinsic_reward From 25d49b5d225ee49c396e2f7f9abf44b82e009286 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Wed, 12 Apr 2023 00:09:20 -0400 Subject: [PATCH 09/52] fix wrong unit test in test_reward_model_utils --- ding/reward_model/reword_model_utils.py | 1 - .../tests/test_reward_model_utils.py | 20 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/ding/reward_model/reword_model_utils.py b/ding/reward_model/reword_model_utils.py index dced2ab0b8..9b6fdefa50 100644 --- a/ding/reward_model/reword_model_utils.py +++ b/ding/reward_model/reword_model_utils.py @@ -58,7 +58,6 @@ def combine_intrinsic_exterinsic_reward( item['reward'] = item['reward'] / config.extrinsic_reward_norm_max elif config.intrinsic_reward_type == 'assign': item['reward'] = in_rew - return train_data_augmented diff --git a/ding/reward_model/tests/test_reward_model_utils.py b/ding/reward_model/tests/test_reward_model_utils.py index 93233cb23b..71eac18a3a 100644 --- 
a/ding/reward_model/tests/test_reward_model_utils.py +++ b/ding/reward_model/tests/test_reward_model_utils.py @@ -39,8 +39,14 @@ def test_concat_state_action_pairs_one_hot(): def test_combine_intrinsic_exterinsic_reward(): intrinsic_reward = torch.rand(1) train_data_augument = [{'obs': torch.rand(5), 'reward': torch.rand(1), 'intrinsic_reward': torch.rand(1)}] + extrinsic_reward = train_data_augument[0]['reward'] config_list = [ { + 'intrinsic_reward_type': 'add', + 'intrinsic_reward_weight': 1, + 'extrinsic_reward_norm_max': 1, + 'extrinsic_reward_norm': False, + }, { 'intrinsic_reward_type': 'new', 'intrinsic_reward_weight': 1, 'extrinsic_reward_norm_max': 1, @@ -54,8 +60,12 @@ def test_combine_intrinsic_exterinsic_reward(): ] for config in config_list: config = EasyDict(config) - item = combine_intrinsic_exterinsic_reward(train_data_augument, intrinsic_reward, config) - if config.intrinsic_reward_type == 'new': - assert item['intrinsic_reward'] == intrinsic_reward - elif config.intrinsic_reward_type == 'assign': - assert item['reward'] == intrinsic_reward + train_data_augument = combine_intrinsic_exterinsic_reward(train_data_augument, intrinsic_reward, config) + for item in train_data_augument: + if config.intrinsic_reward_type == 'add': + real_reward = intrinsic_reward + extrinsic_reward + assert item['reward'].equal(real_reward) + elif config.intrinsic_reward_type == 'new': + assert item['intrinsic_reward'].equal(intrinsic_reward.squeeze()) + else: + assert item['reward'].equal(intrinsic_reward.squeeze()) From c081ff07aa2a633fe5bb6115a5e109e96c17dc99 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Thu, 13 Apr 2023 00:49:36 -0400 Subject: [PATCH 10/52] refactor gcl and pwil --- ding/reward_model/guided_cost_reward_model.py | 21 +++++------- ding/reward_model/network.py | 34 ++++++++++++++++++- ding/reward_model/pwil_irl_model.py | 10 +++--- 3 files changed, 47 insertions(+), 18 deletions(-) diff --git a/ding/reward_model/guided_cost_reward_model.py 
b/ding/reward_model/guided_cost_reward_model.py index 437e198f53..9ce49f3157 100644 --- a/ding/reward_model/guided_cost_reward_model.py +++ b/ding/reward_model/guided_cost_reward_model.py @@ -10,6 +10,7 @@ from ding.utils import REWARD_MODEL_REGISTRY from ding.utils.data import default_collate from .base_reward_model import BaseRewardModel +from .network import GCLNetwork class GuidedCostNN(nn.Module): @@ -89,11 +90,14 @@ class GuidedCostRewardModel(BaseRewardModel): def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> None: # noqa super(GuidedCostRewardModel, self).__init__() self.cfg = config - self.action_shape = self.cfg.action_shape assert device == "cpu" or device.startswith("cuda") self.device = device self.tb_logger = tb_logger - self.reward_model = GuidedCostNN(config.input_size, config.hidden_size) + self.reward_model = GCLNetwork( + config.input_size, [config.hidden_size, config.hidden_size], + output_size=1, + action_shape=config.action_shape + ) self.reward_model.to(self.device) self.opt = optim.Adam(self.reward_model.parameters(), lr=config.learning_rate) @@ -118,16 +122,7 @@ def train(self, expert_demo: torch.Tensor, samp: torch.Tensor, iter, step): samp.extend(expert_demo) expert_demo = default_collate(expert_demo) samp = default_collate(samp) - cost_demo = self.reward_model( - torch.cat([expert_demo['obs'], expert_demo['action'].float().reshape(-1, self.action_shape)], dim=-1) - ) - cost_samp = self.reward_model( - torch.cat([samp['obs'], samp['action'].float().reshape(-1, self.action_shape)], dim=-1) - ) - - prob = samp['prob'].unsqueeze(-1) - loss_IOC = torch.mean(cost_demo) + \ - torch.log(torch.mean(torch.exp(-cost_samp)/(prob+1e-7))) + loss_IOC = self.reward_model.learn(expert_demo, samp) # UPDATING THE COST FUNCTION self.opt.zero_grad() loss_IOC.backward() @@ -142,7 +137,7 @@ def estimate(self, data: list) -> List[Dict]: train_data_augmented = data for i in range(len(train_data_augmented)): with torch.no_grad(): - 
reward = self.reward_model( + reward = self.reward_model.forward( torch.cat([train_data_augmented[i]['obs'], train_data_augmented[i]['action'].float()]).unsqueeze(0) ).squeeze(0) train_data_augmented[i]['reward'] = -reward diff --git a/ding/reward_model/network.py b/ding/reward_model/network.py index 8c5ea846ac..326befe624 100644 --- a/ding/reward_model/network.py +++ b/ding/reward_model/network.py @@ -4,11 +4,13 @@ import torch import torch.nn as nn import torch.nn.functional as F +from torch.distributions import Independent, Normal from ding.utils import SequenceType, REWARD_MODEL_REGISTRY from ding.model import FCEncoder, ConvEncoder from ding.torch_utils.data_helper import to_tensor from ding.torch_utils import one_hot +from ding.utils.data import default_collate from .reword_model_utils import concat_state_action_pairs from functools import partial @@ -105,7 +107,6 @@ def __init__( self.act = nn.Sigmoid() if isinstance(obs_shape, int) or len(obs_shape) == 1: self.feature = RepresentationNetwork(obs_shape, hidden_size_list, activation) - self.concat_state_action_pairs = concat_state_action_pairs self.fc = nn.Linear(hidden_size_list[0], 1) self.image_input = False elif len(obs_shape) == 3: @@ -247,3 +248,34 @@ def forward(self, state: torch.Tensor, next_state: torch.Tensor, action_long: to reward = F.mse_loss(real_next_state_feature, pred_next_state_feature, reduction="none").mean(dim=1) return reward + + +class GCLNetwork(nn.Module): + + def __init__( + self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType, output_size: int, + action_shape: int + ) -> None: + super(GCLNetwork, self).__init__() + self.feature = RepresentationNetwork(obs_shape, hidden_size_list) + self.fc = nn.Linear(hidden_size_list[-1], output_size) + self.action_shape = action_shape + + def forward(self, data: torch.Tensor) -> torch.Tensor: + reward = self.feature(data) + reward = self.fc(reward) + + return reward + + def learn(self, expert_demo: torch.Tensor, samp: 
torch.Tensor) -> torch.Tensor: + cost_demo = self.forward( + torch.cat([expert_demo['obs'], expert_demo['action'].float().reshape(-1, self.action_shape)], dim=-1) + ) + cost_samp = self.forward( + torch.cat([samp['obs'], samp['action'].float().reshape(-1, self.action_shape)], dim=-1) + ) + prob = samp['prob'].unsqueeze(-1) + loss_IOC = torch.mean(cost_demo) + \ + torch.log(torch.mean(torch.exp(-cost_samp)/(prob+1e-7))) + + return loss_IOC diff --git a/ding/reward_model/pwil_irl_model.py b/ding/reward_model/pwil_irl_model.py index 5fec46b821..ba777f7cb9 100644 --- a/ding/reward_model/pwil_irl_model.py +++ b/ding/reward_model/pwil_irl_model.py @@ -1,4 +1,5 @@ from typing import Dict, List +from ditk import logging import math import random import pickle @@ -6,6 +7,7 @@ from ding.utils import REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel +from .reword_model_utils import concat_state_action_pairs def collect_state_action_pairs(iterator): @@ -16,13 +18,12 @@ def collect_state_action_pairs(iterator): Arguments: - iterator (:obj:`Iterable`): Iterables with at least ``obs`` and ``action`` tensor keys. Returns: - - res (:obj:`Torch.tensor`): State and action pairs. + - res (:obj:`List(Tuple(torch.Tensor, torch.Tensor))`): State and action pairs. 
""" res = [] for item in iterator: state = item['obs'] action = item['action'] - # s_a = torch.cat([state, action.float()], dim=-1) res.append((state, action)) return res @@ -95,6 +96,7 @@ def __init__(self, config: Dict, device: str, tb_logger: 'SummaryWriter') -> Non self.cfg: Dict = config assert device in ["cpu", "cuda"] or "cuda" in device self.device = device + self.tb_logger = tb_logger self.expert_data: List[tuple] = [] self.train_data: List[tuple] = [] # In this algo, model is a dict @@ -114,12 +116,12 @@ def load_expert_data(self) -> None: """ with open(self.cfg.expert_data_path, 'rb') as f: self.expert_data = pickle.load(f) - print("the data size is:", len(self.expert_data)) + logging.info("the data size is: %d", len(self.expert_data)) sample_size = min(self.cfg.sample_size, len(self.expert_data)) self.expert_data = random.sample(self.expert_data, sample_size) self.expert_data = [(item['obs'], item['action']) for item in self.expert_data] self.expert_s, self.expert_a = list(zip(*self.expert_data)) - print('the expert data demonstrations is:', len(self.expert_data)) + logging.info('the expert data demonstrations is: %d', len(self.expert_data)) def collect_data(self, data: list) -> None: """ From f1218cdeb0db4b51ca52e7d8deffe5af9f43f712 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Thu, 13 Apr 2023 00:55:06 -0400 Subject: [PATCH 11/52] refactor pdeil --- ding/reward_model/pdeil_irl_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ding/reward_model/pdeil_irl_model.py b/ding/reward_model/pdeil_irl_model.py index 0bd42c8fdc..6e47068e4d 100644 --- a/ding/reward_model/pdeil_irl_model.py +++ b/ding/reward_model/pdeil_irl_model.py @@ -1,6 +1,7 @@ import numpy as np import torch import pickle +from ditk import logging from typing import List, Dict import scipy.stats as stats try: @@ -163,7 +164,7 @@ def estimate(self, data: list) -> List[Dict]: s = torch.stack([item['obs'] for item in train_data_augmented], dim=0) a = 
torch.stack([item['action'] for item in train_data_augmented], dim=0) if self.p_u_s is None: - print("you need to train you reward model first") + logging.warning("you need to train you reward model first") for item in train_data_augmented: item['reward'].zero_() else: From d9060c2ab92d968d37c60739bf16313828e0ec28 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Thu, 13 Apr 2023 01:49:36 -0400 Subject: [PATCH 12/52] add hidden_size_list to gail --- dizoo/atari/config/serial/pong/pong_gail_dqn_config.py | 6 ++++-- .../bipedalwalker/config/bipedalwalker_gail_sac_config.py | 2 +- .../box2d/lunarlander/config/lunarlander_gail_dqn_config.py | 2 +- .../cartpole/config/cartpole_dqn_gail_config.py | 2 +- dizoo/mujoco/config/ant_gail_sac_config.py | 2 +- dizoo/mujoco/config/halfcheetah_gail_sac_config.py | 2 +- dizoo/mujoco/config/hopper_gail_sac_config.py | 2 +- 7 files changed, 10 insertions(+), 8 deletions(-) diff --git a/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py b/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py index d014681a65..f11a55ecc1 100644 --- a/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py +++ b/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py @@ -14,7 +14,9 @@ reward_model=dict( type='gail', input_size=[4, 84, 84], - hidden_size=128, + hidden_size_list=[16, 16, 16, 16, 64], + kernel_size=[7, 5, 3, 3], + stride=[3, 2, 2, 1], batch_size=64, learning_rate=1e-3, update_per_collect=100, @@ -87,4 +89,4 @@ max_env_step=1000000, seed=0, collect_data=True - ) \ No newline at end of file + ) diff --git a/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py b/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py index 4ef3d1b068..5ceefdb02f 100755 --- a/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py +++ b/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py @@ -18,7 +18,7 @@ reward_model=dict( type='gail', input_size=obs_shape + act_shape, - hidden_size=64, + hidden_size_list=[64], 
batch_size=64, learning_rate=1e-3, update_per_collect=100, diff --git a/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py b/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py index 855f845980..c69506b5ba 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py @@ -15,7 +15,7 @@ reward_model=dict( type='gail', input_size=9, - hidden_size=64, + hidden_size_list=[64], batch_size=64, learning_rate=1e-3, update_per_collect=100, diff --git a/dizoo/classic_control/cartpole/config/cartpole_dqn_gail_config.py b/dizoo/classic_control/cartpole/config/cartpole_dqn_gail_config.py index b438e648e3..df5cf18ad9 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_dqn_gail_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_dqn_gail_config.py @@ -11,7 +11,7 @@ reward_model=dict( type='gail', input_size=5, - hidden_size=64, + hidden_size_list=[64], batch_size=64, learning_rate=1e-3, update_per_collect=100, diff --git a/dizoo/mujoco/config/ant_gail_sac_config.py b/dizoo/mujoco/config/ant_gail_sac_config.py index b7e7cd7d06..53fd375318 100644 --- a/dizoo/mujoco/config/ant_gail_sac_config.py +++ b/dizoo/mujoco/config/ant_gail_sac_config.py @@ -15,7 +15,7 @@ ), reward_model=dict( input_size=obs_shape + act_shape, - hidden_size=256, + hidden_size_list=[256], batch_size=64, learning_rate=1e-3, update_per_collect=100, diff --git a/dizoo/mujoco/config/halfcheetah_gail_sac_config.py b/dizoo/mujoco/config/halfcheetah_gail_sac_config.py index bf64cd8c64..2b0ee0a5a1 100644 --- a/dizoo/mujoco/config/halfcheetah_gail_sac_config.py +++ b/dizoo/mujoco/config/halfcheetah_gail_sac_config.py @@ -15,7 +15,7 @@ ), reward_model=dict( input_size=obs_shape + act_shape, - hidden_size=256, + hidden_size_list=[256], batch_size=64, learning_rate=1e-3, update_per_collect=100, diff --git a/dizoo/mujoco/config/hopper_gail_sac_config.py b/dizoo/mujoco/config/hopper_gail_sac_config.py index 
26ef8b3816..b82a075e0c 100644 --- a/dizoo/mujoco/config/hopper_gail_sac_config.py +++ b/dizoo/mujoco/config/hopper_gail_sac_config.py @@ -15,7 +15,7 @@ ), reward_model=dict( input_size=obs_shape + act_shape, - hidden_size=256, + hidden_size_list=[256], batch_size=64, learning_rate=1e-3, update_per_collect=100, From 179182ab3ac1405a8f5e704cd2fcf501066b5815 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Thu, 13 Apr 2023 02:00:55 -0400 Subject: [PATCH 13/52] change gail test for new config --- .../tests/test_serial_entry_reward_model.py | 2 +- ding/reward_model/gail_irl_model.py | 29 ++++++++++++++----- .../reward_model/tests/test_gail_irl_model.py | 6 ++-- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/ding/entry/tests/test_serial_entry_reward_model.py b/ding/entry/tests/test_serial_entry_reward_model.py index ceaf9dc665..de07b67d31 100644 --- a/ding/entry/tests/test_serial_entry_reward_model.py +++ b/ding/entry/tests/test_serial_entry_reward_model.py @@ -20,7 +20,7 @@ { 'type': 'gail', 'input_size': 5, - 'hidden_size': 64, + 'hidden_size_list': [64], 'batch_size': 64, }, { diff --git a/ding/reward_model/gail_irl_model.py b/ding/reward_model/gail_irl_model.py index b93b711be4..2db47be052 100644 --- a/ding/reward_model/gail_irl_model.py +++ b/ding/reward_model/gail_irl_model.py @@ -40,10 +40,15 @@ class GailRewardModel(BaseRewardModel): | | obs_dim + act_dim | 7 | ``target_new_`` int 64 | Collect steps per iteration | | ``data_count`` | | - 8 | ``hidden_size`` int 128 | Linear model hidden size | - 9 | ``collect_count`` int 100000 | Expert dataset size | One entry is a (s,a) + 8 | ``hidden_size`` list( [32,32,64] | Sequence of ``hidden_size`` | + | ``_list`` int) | of reward network. 
| + 9 | ``kernel_size`` list( [5,3] | kernel size list | only used in image + | int) | | input + 10 | ``stride`` list( | stride size list | only used in image + | int) | | input + 11 | ``collect_count`` int 100000 | Expert dataset size | One entry is a (s,a) | | | tuple - 10 | ``clear_buffer_`` int 1 | clear buffer per fixed iters | make sure replay + 12 | ``clear_buffer_`` int 1 | clear buffer per fixed iters | make sure replay | ``per_iters`` | buffer's data count | | isn't too few. | | (code work in entry) @@ -66,8 +71,13 @@ class GailRewardModel(BaseRewardModel): # action_size=6, # (int) Collect steps per iteration. target_new_data_count=64, - # (int) Linear model hidden size. - hidden_size=128, + # (list(int)) Sequence of ``hidden_size`` of reward network. + # if the input is vector, hidden_Size_list = [hidden_size] + hidden_size_list=[16, 16, 16, 16, 64], + # (list(int)) kernel size list, used for image input, size should be len(hidden_size_list) - 1 + kernel_size=[7, 5, 3, 3], + # (list(int)) stride size list, used for image input, size should be len(hidden_size_list) - 1 + stride=[3, 2, 2, 1], # (int) Expert dataset size. collect_count=100000, # (int) Clear buffer per fixed iters. 
@@ -90,12 +100,17 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> self.tb_logger = tb_logger obs_shape = config.input_size if isinstance(obs_shape, int) or len(obs_shape) == 1: - self.reward_model = GAILNetwork(obs_shape, [config.hidden_size], nn.Tanh()) + self.reward_model = GAILNetwork(obs_shape, config.hidden_size_list, nn.Tanh()) self.concat_state_action_pairs = concat_state_action_pairs elif len(obs_shape) == 3: action_shape = self.cfg.action_size self.reward_model = GAILNetwork( - obs_shape, [16, 16, 16, 16, 64], [7, 5, 3, 3], [3, 2, 2, 1], nn.LeakyReLU(), action_shape=action_shape + obs_shape, + config.hidden_size_list, + config.kernel_size, + config.stride, + nn.LeakyReLU(), + action_shape=action_shape ) self.concat_state_action_pairs = partial(concat_state_action_pairs, action_size=action_shape, one_hot_=True) self.reward_model.to(self.device) diff --git a/ding/reward_model/tests/test_gail_irl_model.py b/ding/reward_model/tests/test_gail_irl_model.py index bac9d3d950..6bf72d68c2 100644 --- a/ding/reward_model/tests/test_gail_irl_model.py +++ b/ding/reward_model/tests/test_gail_irl_model.py @@ -23,7 +23,7 @@ cfg1 = dict( input_size=obs_space_1d + 1, - hidden_size=64, + hidden_size_list=[64], batch_size=5, learning_rate=1e-3, update_per_collect=2, @@ -32,7 +32,9 @@ cfg2 = dict( input_size=obs_space_3d, - hidden_size=64, + hidden_size_list=[16, 16, 16, 16, 64], + kernel_size=[7, 5, 3, 3], + stride=[3, 2, 2, 1], batch_size=5, learning_rate=1e-3, update_per_collect=2, From d0677317311933db24a3c4add78b6a17ccbcd6f0 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Fri, 14 Apr 2023 01:49:54 -0400 Subject: [PATCH 14/52] refactor trex network --- .../serial_entry_preference_based_irl.py | 2 +- .../test_serial_entry_preference_based_irl.py | 21 -- ding/reward_model/__init__.py | 2 +- ding/reward_model/network.py | 49 +++- ding/reward_model/trex_reward_model.py | 221 +++++------------- .../config/cartpole_trex_offppo_config.py | 1 + 6 
files changed, 106 insertions(+), 190 deletions(-) diff --git a/ding/entry/serial_entry_preference_based_irl.py b/ding/entry/serial_entry_preference_based_irl.py index 682e662baa..85caba7ee7 100644 --- a/ding/entry/serial_entry_preference_based_irl.py +++ b/ding/entry/serial_entry_preference_based_irl.py @@ -79,7 +79,7 @@ def serial_pipeline_preference_based_irl( cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode ) - reward_model = create_reward_model(cfg_bak, policy.collect_mode.get_attribute('device'), tb_logger) + reward_model = create_reward_model(cfg, policy.collect_mode.get_attribute('device'), tb_logger) reward_model.train() # ========== # Main loop diff --git a/ding/entry/tests/test_serial_entry_preference_based_irl.py b/ding/entry/tests/test_serial_entry_preference_based_irl.py index 7e9198f929..89b457b0ef 100644 --- a/ding/entry/tests/test_serial_entry_preference_based_irl.py +++ b/ding/entry/tests/test_serial_entry_preference_based_irl.py @@ -39,24 +39,3 @@ def test_serial_pipeline_trex(): assert False, "pipeline fail" finally: os.popen('rm -rf test_serial_pipeline_trex*') - - -B = 4 -C, H, W = 3, 128, 128 - - -@pytest.mark.unittest -class TestEncoder: - - def output_check(self, model, outputs): - loss = outputs.sum() - is_differentiable(loss, model) - - def test_conv_encoder(self): - inputs = torch.randn(B, C, H, W) - model = TrexConvEncoder((C, H, W)) - print(model) - outputs = model(inputs) - self.output_check(model, outputs) - print(outputs.shape) - assert outputs.shape == (B, 1) diff --git a/ding/reward_model/__init__.py b/ding/reward_model/__init__.py index dd668e0118..c1914116ac 100644 --- a/ding/reward_model/__init__.py +++ b/ding/reward_model/__init__.py @@ -13,5 +13,5 @@ from .guided_cost_reward_model import GuidedCostRewardModel from .ngu_reward_model import RndNGURewardModel, EpisodicNGURewardModel from .icm_reward_model import ICMRewardModel -from .network import RepresentationNetwork, RNDNetwork, 
REDNetwork, GAILNetwork +from .network import RepresentationNetwork, RNDNetwork, REDNetwork, GAILNetwork, ICMNetwork, GCLNetwork, TREXNetwork from .reword_model_utils import concat_state_action_pairs, combine_intrinsic_exterinsic_reward, obs_norm, collect_states diff --git a/ding/reward_model/network.py b/ding/reward_model/network.py index 326befe624..1ec0209583 100644 --- a/ding/reward_model/network.py +++ b/ding/reward_model/network.py @@ -97,8 +97,8 @@ def __init__( self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType, - kernel_size: Optional[SequenceType] = [8, 4, 3], - stride: Optional[SequenceType] = [4, 2, 1], + kernel_size: Optional[SequenceType] = None, + stride: Optional[SequenceType] = None, activation: Optional[nn.Module] = nn.ReLU(), action_shape: Optional[int] = None ) -> None: @@ -279,3 +279,48 @@ def learn(self, expert_demo: torch.Tensor, samp: torch.Tensor) -> torch.Tensor: torch.log(torch.mean(torch.exp(-cost_samp)/(prob+1e-7))) return loss_IOC + + +class TREXNetwork(nn.Module): + + def __init__( + self, + obs_shape: Union[int, SequenceType], + hidden_size_list: SequenceType, + kernel_size: Optional[SequenceType] = None, + stride: Optional[SequenceType] = None, + activation: Optional[nn.Module] = nn.ReLU(), + l1_reg: Optional[float] = 0, + ) -> None: + super(TREXNetwork, self).__init__() + self.input_size = obs_shape + self.l1_reg = l1_reg + self.output_size = hidden_size_list[-1] + hidden_size_list = hidden_size_list[:-1] + self.feature = RepresentationNetwork(obs_shape, hidden_size_list, activation, kernel_size, stride) + self.act = activation + self.fc = nn.Linear(hidden_size_list[-1], self.output_size) + + def forward(self, data: torch.Tensor) -> torch.Tensor: + reward = self.feature(data) + if isinstance(self.input_size, int) is False and len(self.input_size) == 3: + reward = self.act(reward) + reward = self.fc(reward) + return reward + + def learn(self, traj_i: torch.Tensor, traj_j: torch.Tensor, labels: torch.Tensor) -> 
torch.Tensor: + outputs, total_abs_reward = self.get_outputs_abs_reward(traj_i, traj_j) + outputs = outputs.unsqueeze(0) + loss = F.cross_entropy(outputs, labels) + self.l1_reg * total_abs_reward + return loss + + def get_outputs_abs_reward(self, traj_i: torch.Tensor, traj_j: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + reward_i = self.forward(traj_i) + reward_j = self.forward(traj_j) + + cum_r_i = torch.sum(reward_i) + cum_r_j = torch.sum(reward_j) + outputs = torch.cat((cum_r_i.unsqueeze(0), cum_r_j.unsqueeze(0)), 0) + total_abs_reward = torch.sum(torch.abs(reward_i)) + torch.sum(torch.abs(reward_j)) + + return outputs, total_abs_reward diff --git a/ding/reward_model/trex_reward_model.py b/ding/reward_model/trex_reward_model.py index 6fe349b5f5..8860318222 100644 --- a/ding/reward_model/trex_reward_model.py +++ b/ding/reward_model/trex_reward_model.py @@ -1,6 +1,7 @@ from copy import deepcopy from typing import Tuple, Optional, List, Dict from easydict import EasyDict +from ditk import logging import pickle import os import numpy as np @@ -17,116 +18,7 @@ from .base_reward_model import BaseRewardModel from .reword_model_utils import collect_states - - -class TrexConvEncoder(nn.Module): - r""" - Overview: - The ``Convolution Encoder`` used in models. Used to encoder raw 2-dim observation. - Interfaces: - ``__init__``, ``forward`` - """ - - def __init__( - self, - obs_shape: SequenceType, - hidden_size_list: SequenceType = [16, 16, 16, 16, 64, 1], - activation: Optional[nn.Module] = nn.LeakyReLU() - ) -> None: - r""" - Overview: - Init the Trex Convolution Encoder according to arguments. 
TrexConvEncoder is different \ - from the ConvEncoder in model.common.encoder, their stride and kernel size parameters \ - are different - Arguments: - - obs_shape (:obj:`SequenceType`): Sequence of ``in_channel``, some ``output size`` - - hidden_size_list (:obj:`SequenceType`): The collection of ``hidden_size`` - - activation (:obj:`nn.Module`): - The type of activation to use in the conv ``layers``, - if ``None`` then default set to ``nn.LeakyReLU()`` - """ - super(TrexConvEncoder, self).__init__() - self.obs_shape = obs_shape - self.act = activation - self.hidden_size_list = hidden_size_list - - layers = [] - kernel_size = [7, 5, 3, 3] - stride = [3, 2, 1, 1] - input_size = obs_shape[0] # in_channel - for i in range(len(kernel_size)): - layers.append(nn.Conv2d(input_size, hidden_size_list[i], kernel_size[i], stride[i])) - layers.append(self.act) - input_size = hidden_size_list[i] - layers.append(nn.Flatten()) - self.main = nn.Sequential(*layers) - - flatten_size = self._get_flatten_size() - self.mid = nn.Sequential( - nn.Linear(flatten_size, hidden_size_list[-2]), self.act, - nn.Linear(hidden_size_list[-2], hidden_size_list[-1]) - ) - - def _get_flatten_size(self) -> int: - r""" - Overview: - Get the encoding size after ``self.main`` to get the number of ``in-features`` to feed to ``nn.Linear``. 
- Arguments: - - x (:obj:`torch.Tensor`): Encoded Tensor after ``self.main`` - Returns: - - outputs (:obj:`torch.Tensor`): Size int, also number of in-feature - """ - test_data = torch.randn(1, *self.obs_shape) - with torch.no_grad(): - output = self.main(test_data) - return output.shape[1] - - def forward(self, x: torch.Tensor) -> torch.Tensor: - r""" - Overview: - Return embedding tensor of the env observation - Arguments: - - x (:obj:`torch.Tensor`): Env raw observation - Returns: - - outputs (:obj:`torch.Tensor`): Embedding tensor - """ - x = self.main(x) - x = self.mid(x) - return x - - -class TrexModel(nn.Module): - - def __init__(self, obs_shape): - super(TrexModel, self).__init__() - if isinstance(obs_shape, int) or len(obs_shape) == 1: - self.encoder = nn.Sequential(FCEncoder(obs_shape, [512, 64]), nn.Linear(64, 1)) - # Conv Encoder - elif len(obs_shape) == 3: - self.encoder = TrexConvEncoder(obs_shape) - else: - raise KeyError( - "not support obs_shape for pre-defined encoder: {}, please customize your own Trex model". 
- format(obs_shape) - ) - - def cum_return(self, traj: torch.Tensor, mode: str = 'sum') -> Tuple[torch.Tensor, torch.Tensor]: - '''calculate cumulative return of trajectory''' - r = self.encoder(traj) - if mode == 'sum': - sum_rewards = torch.sum(r) - sum_abs_rewards = torch.sum(torch.abs(r)) - return sum_rewards, sum_abs_rewards - elif mode == 'batch': - return r, torch.abs(r) - else: - raise KeyError("not support mode: {}, please choose mode=sum or mode=batch".format(mode)) - - def forward(self, traj_i: torch.Tensor, traj_j: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - '''compute cumulative return for each trajectory and return logits''' - cum_r_i, abs_r_i = self.cum_return(traj_i) - cum_r_j, abs_r_j = self.cum_return(traj_j) - return torch.cat((cum_r_i.unsqueeze(0), cum_r_j.unsqueeze(0)), 0), abs_r_i + abs_r_j +from .network import TREXNetwork @REWARD_MODEL_REGISTRY.register('trex') @@ -179,7 +71,11 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> assert device in ["cpu", "cuda"] or "cuda" in device self.device = device self.tb_logger = tb_logger - self.reward_model = TrexModel(self.cfg.policy.model.obs_shape) + kernel_size = config.reward_model.kernel_size if 'kernel_size' in config else None + stride = config.reward_model.stride if 'stride' in config else None + self.reward_model = TREXNetwork( + self.cfg.policy.model.obs_shape, config.reward_model.hidden_size_list, kernel_size, stride + ) self.reward_model.to(self.device) self.pre_expert_data = [] self.train_data = [] @@ -216,8 +112,8 @@ def load_expert_data(self) -> None: self.learning_returns = pickle.load(f) self.create_training_data() - self._logger.info("num_training_obs: {}".format(len(self.training_obs))) - self._logger.info("num_labels: {}".format(len(self.training_labels))) + logging.info("num_training_obs: {}".format(len(self.training_obs))) + logging.info("num_labels: {}".format(len(self.training_labels))) def create_training_data(self): num_trajs = 
self.num_trajs @@ -229,10 +125,10 @@ def create_training_data(self): for i in range(len(self.pre_expert_data)): demo_lengths.append([len(d) for d in self.pre_expert_data[i]]) - self._logger.info("demo_lengths: {}".format(demo_lengths)) + logging.info("demo_lengths: {}".format(demo_lengths)) max_snippet_length = min(np.min(demo_lengths), max_snippet_length) - self._logger.info("min snippet length: {}".format(min_snippet_length)) - self._logger.info("max snippet length: {}".format(max_snippet_length)) + logging.info("min snippet length: {}".format(min_snippet_length)) + logging.info("max snippet length: {}".format(max_snippet_length)) # collect training data max_traj_length = 0 @@ -285,57 +181,56 @@ def create_training_data(self): label = int(bi <= bj) self.training_obs.append((traj_i, traj_j)) self.training_labels.append(label) - self._logger.info(("maximum traj length: {}".format(max_traj_length))) + logging.info(("maximum traj length: {}".format(max_traj_length))) return self.training_obs, self.training_labels - def _train(self): + def _train(self, training_obs: Tuple, training_labels: Tuple) -> float: # check if gpu available device = self.device # torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Assume that we are on a CUDA machine, then this should print a CUDA device: - self._logger.info("device: {}".format(device)) + logging.info("device: {}".format(device)) + cum_loss = 0.0 + for i in range(len(training_labels)): + + # traj_i, traj_j has the same length, however, they change as i increases + traj_i, traj_j = training_obs[i] # traj_i is a list of array generated by env.step + traj_i = np.array(traj_i) + traj_j = np.array(traj_j) + traj_i = torch.from_numpy(traj_i).float().to(device) + traj_j = torch.from_numpy(traj_j).float().to(device) + + # training_labels[i] is a boolean integer: 0 or 1 + labels = torch.tensor([training_labels[i]]).to(device) + + # forward + backward + zero out gradient + optimize + loss = self.reward_model.learn(traj_i, 
traj_j, labels) + self.opt.zero_grad() + loss.backward() + self.opt.step() + + # print stats to see if learning + item_loss = loss.item() + cum_loss += item_loss + return cum_loss + # if not os.path.exists(os.path.join(self.cfg.exp_name, 'ckpt_reward_model')): + # os.makedirs(os.path.join(self.cfg.exp_name, 'ckpt_reward_model')) + # torch.save(self.reward_model.state_dict(), os.path.join(self.cfg.exp_name, 'ckpt_reward_model/latest.pth.tar')) + # logging.info("finished training") + + def train(self): + # check if gpu available + device = self.device # torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + # Assume that we are on a CUDA machine, then this should print a CUDA device: + logging.info("device: {}".format(device)) training_inputs, training_outputs = self.training_obs, self.training_labels - loss_criterion = nn.CrossEntropyLoss() cum_loss = 0.0 training_data = list(zip(training_inputs, training_outputs)) - for epoch in range(self.cfg.reward_model.update_per_collect): # todo + for epoch in range(self.cfg.reward_model.update_per_collect): np.random.shuffle(training_data) training_obs, training_labels = zip(*training_data) - for i in range(len(training_labels)): - - # traj_i, traj_j has the same length, however, they change as i increases - traj_i, traj_j = training_obs[i] # traj_i is a list of array generated by env.step - traj_i = np.array(traj_i) - traj_j = np.array(traj_j) - traj_i = torch.from_numpy(traj_i).float().to(device) - traj_j = torch.from_numpy(traj_j).float().to(device) - - # training_labels[i] is a boolean integer: 0 or 1 - labels = torch.tensor([training_labels[i]]).to(device) - - # forward + backward + zero out gradient + optimize - outputs, abs_rewards = self.reward_model.forward(traj_i, traj_j) - outputs = outputs.unsqueeze(0) - loss = loss_criterion(outputs, labels) + self.l1_reg * abs_rewards - self.opt.zero_grad() - loss.backward() - self.opt.step() - - # print stats to see if learning - item_loss = loss.item() - cum_loss += 
item_loss - if i % 100 == 99: - self._logger.info("[epoch {}:{}] loss {}".format(epoch, i, cum_loss)) - self._logger.info("abs_returns: {}".format(abs_rewards)) - cum_loss = 0.0 - self._logger.info("check pointing") - if not os.path.exists(os.path.join(self.cfg.exp_name, 'ckpt_reward_model')): - os.makedirs(os.path.join(self.cfg.exp_name, 'ckpt_reward_model')) - torch.save(self.reward_model.state_dict(), os.path.join(self.cfg.exp_name, 'ckpt_reward_model/latest.pth.tar')) - self._logger.info("finished training") - - def train(self): - self._train() + cum_loss = self._train(training_obs, training_labels) + logging.info("[epoch {}] loss {}".format(epoch, cum_loss)) # print out predicted cumulative returns and actual returns sorted_returns = sorted(self.learning_returns, key=lambda s: s[0]) demonstrations = [ @@ -344,7 +239,7 @@ def train(self): with torch.no_grad(): pred_returns = [self.predict_traj_return(self.reward_model, traj[0]) for traj in demonstrations] for i, p in enumerate(pred_returns): - self._logger.info("{} {} {}".format(i, p, sorted_returns[i][0])) + logging.info("{} {} {}".format(i, p, sorted_returns[i][0])) info = { "demo_length": [len(d[0]) for d in self.pre_expert_data], "min_snippet_length": self.min_snippet_length, @@ -353,18 +248,14 @@ def train(self): "lem_num_labels": len(self.training_labels), "accuracy": self.calc_accuracy(self.reward_model, self.training_obs, self.training_labels), } - self._logger.info( - "accuracy and comparison:\n{}".format('\n'.join(['{}: {}'.format(k, v) for k, v in info.items()])) - ) + logging.info("accuracy and comparison:\n{}".format('\n'.join(['{}: {}'.format(k, v) for k, v in info.items()]))) def predict_traj_return(self, net, traj): device = self.device # torch.set_printoptions(precision=20) # torch.use_deterministic_algorithms(True) with torch.no_grad(): - rewards_from_obs = net.cum_return( - torch.from_numpy(np.array(traj)).float().to(device), mode='batch' - )[0].squeeze().tolist() + rewards_from_obs = 
net.forward(torch.from_numpy(np.array(traj)).float().to(device)).squeeze().tolist() # rewards_from_obs1 = net.cum_return(torch.from_numpy(np.array([traj[0]])).float().to(device))[0].item() # different precision return sum(rewards_from_obs) # rewards_from_obs is a list of floats @@ -383,7 +274,7 @@ def calc_accuracy(self, reward_network, training_inputs, training_outputs): traj_j = torch.from_numpy(traj_j).float().to(device) #forward to get logits - outputs, abs_return = reward_network.forward(traj_i, traj_j) + outputs, abs_return = reward_network.get_outputs_abs_reward(traj_i, traj_j) _, pred_label = torch.max(outputs, 0) if pred_label.item() == label: num_correct += 1. @@ -412,7 +303,7 @@ def estimate(self, data: list) -> List[Dict]: res = collect_states(train_data_augmented) res = torch.stack(res).to(self.device) with torch.no_grad(): - sum_rewards, sum_abs_rewards = self.reward_model.cum_return(res, mode='batch') + sum_rewards = self.reward_model.forward(res) for item, rew in zip(train_data_augmented, sum_rewards): # TODO optimise this loop as well ? 
item['reward'] = rew diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py index b58535f900..12db6d7e82 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py @@ -20,6 +20,7 @@ expert_model_path='abs model path', reward_model_path='abs data path + ./cartpole.params', data_path='abs data path', + hidden_size_list=[512, 64, 1] ), policy=dict( cuda=False, From 29f0d558badf486756bb94ec21a37114f00b40d8 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Fri, 14 Apr 2023 14:15:26 -0400 Subject: [PATCH 15/52] fix style and wrong import --- ding/entry/serial_entry_preference_based_irl.py | 2 +- ding/entry/tests/test_serial_entry_preference_based_irl.py | 2 -- ding/reward_model/trex_reward_model.py | 3 ++- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/ding/entry/serial_entry_preference_based_irl.py b/ding/entry/serial_entry_preference_based_irl.py index 85caba7ee7..682e662baa 100644 --- a/ding/entry/serial_entry_preference_based_irl.py +++ b/ding/entry/serial_entry_preference_based_irl.py @@ -79,7 +79,7 @@ def serial_pipeline_preference_based_irl( cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode ) - reward_model = create_reward_model(cfg, policy.collect_mode.get_attribute('device'), tb_logger) + reward_model = create_reward_model(cfg_bak, policy.collect_mode.get_attribute('device'), tb_logger) reward_model.train() # ========== # Main loop diff --git a/ding/entry/tests/test_serial_entry_preference_based_irl.py b/ding/entry/tests/test_serial_entry_preference_based_irl.py index 89b457b0ef..3b577ce916 100644 --- a/ding/entry/tests/test_serial_entry_preference_based_irl.py +++ b/ding/entry/tests/test_serial_entry_preference_based_irl.py @@ -12,8 +12,6 @@ from dizoo.classic_control.cartpole.config.cartpole_ppo_offpolicy_config 
import cartpole_ppo_offpolicy_config,\ cartpole_ppo_offpolicy_create_config from ding.entry.application_entry_trex_collect_data import trex_collecting_data -from ding.reward_model.trex_reward_model import TrexConvEncoder -from ding.torch_utils import is_differentiable @pytest.mark.unittest diff --git a/ding/reward_model/trex_reward_model.py b/ding/reward_model/trex_reward_model.py index 8860318222..37ca1d2a15 100644 --- a/ding/reward_model/trex_reward_model.py +++ b/ding/reward_model/trex_reward_model.py @@ -214,7 +214,8 @@ def _train(self, training_obs: Tuple, training_labels: Tuple) -> float: return cum_loss # if not os.path.exists(os.path.join(self.cfg.exp_name, 'ckpt_reward_model')): # os.makedirs(os.path.join(self.cfg.exp_name, 'ckpt_reward_model')) - # torch.save(self.reward_model.state_dict(), os.path.join(self.cfg.exp_name, 'ckpt_reward_model/latest.pth.tar')) + # torch.save(self.reward_model.state_dict(), os.path.join(self.cfg.exp_name, + # 'ckpt_reward_model/latest.pth.tar')) # logging.info("finished training") def train(self): From 4ec0bd31ac87ca39e39b7b520aec9678df96ac59 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Fri, 14 Apr 2023 14:35:31 -0400 Subject: [PATCH 16/52] fix style for trex --- ding/reward_model/trex_reward_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ding/reward_model/trex_reward_model.py b/ding/reward_model/trex_reward_model.py index 37ca1d2a15..2d32074b49 100644 --- a/ding/reward_model/trex_reward_model.py +++ b/ding/reward_model/trex_reward_model.py @@ -214,7 +214,7 @@ def _train(self, training_obs: Tuple, training_labels: Tuple) -> float: return cum_loss # if not os.path.exists(os.path.join(self.cfg.exp_name, 'ckpt_reward_model')): # os.makedirs(os.path.join(self.cfg.exp_name, 'ckpt_reward_model')) - # torch.save(self.reward_model.state_dict(), os.path.join(self.cfg.exp_name, + # torch.save(self.reward_model.state_dict(), os.path.join(self.cfg.exp_name, # 'ckpt_reward_model/latest.pth.tar')) # 
logging.info("finished training") From 660af32a9fdd56178bb7c124fdfdc4bd4848668f Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Mon, 17 Apr 2023 00:48:20 -0400 Subject: [PATCH 17/52] fix unit test for trex onppo --- .../cartpole/config/cartpole_trex_onppo_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py index 71b4d4a136..14ada190a6 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py @@ -20,6 +20,7 @@ expert_model_path='abs model path', reward_model_path='abs data path + ./cartpole.params', data_path='abs data path', + hidden_size_list=[512, 64, 1] ), policy=dict( cuda=False, From b4e81dde51f30c679793824142d26fa56ad3fa38 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Fri, 21 Apr 2023 04:52:47 -0400 Subject: [PATCH 18/52] refactor ngu and provide cartpole config file --- ding/reward_model/network.py | 112 ++++++++++- ding/reward_model/ngu_reward_model.py | 181 +++--------------- .../cartpole/config/cartpole_bc_config.py | 2 +- .../config/cartpole_ngu_config copy.py | 126 ++++++++++++ 4 files changed, 261 insertions(+), 160 deletions(-) create mode 100644 dizoo/classic_control/cartpole/config/cartpole_ngu_config copy.py diff --git a/ding/reward_model/network.py b/ding/reward_model/network.py index 1ec0209583..1182d70cf1 100644 --- a/ding/reward_model/network.py +++ b/ding/reward_model/network.py @@ -10,6 +10,7 @@ from ding.model import FCEncoder, ConvEncoder from ding.torch_utils.data_helper import to_tensor from ding.torch_utils import one_hot +from ding.utils import RunningMeanStd from ding.utils.data import default_collate from .reword_model_utils import concat_state_action_pairs from functools import partial @@ -53,12 +54,13 @@ def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: Sequen for param 
in self.target.parameters(): param.requires_grad = False - def forward(self, obs: torch.Tensor) -> torch.Tensor: + def forward(self, obs: torch.Tensor, norm: Optional[bool] = True) -> torch.Tensor: with torch.no_grad(): predict_feature = self.predictor(obs) target_feature = self.target(obs) reward = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1) - reward = (reward - reward.min()) / (reward.max() - reward.min() + 1e-8) + if norm: + reward = (reward - reward.min()) / (reward.max() - reward.min() + 1e-8) return reward def learn(self, obs: torch.Tensor) -> torch.Tensor: @@ -324,3 +326,109 @@ def get_outputs_abs_reward(self, traj_i: torch.Tensor, traj_j: torch.Tensor) -> total_abs_reward = torch.sum(torch.abs(reward_i)) + torch.sum(torch.abs(reward_j)) return outputs, total_abs_reward + + +class InverseNetwork(nn.Module): + + def __init__( + self, obs_shape: Union[int, SequenceType], action_shape: int, hidden_size_list: SequenceType, device: str + ) -> None: + super(InverseNetwork, self).__init__() + self._running_mean_std_episodic_dist = RunningMeanStd(epsilon=1e-4) + self.embedding_net = RepresentationNetwork(obs_shape, hidden_size_list) + self.obs_shape = obs_shape + self.device = device + self.inverse_net = nn.Sequential( + nn.Linear(hidden_size_list[-1] * 2, 512), nn.ReLU(inplace=True), nn.Linear(512, action_shape) + ) + + def _compute_intrinsic_reward( + self, + episodic_memory: List, + current_controllable_state: torch.Tensor, + k=10, + kernel_cluster_distance=0.008, + kernel_epsilon=0.0001, + c=0.001, + siminarity_max=8, + ) -> torch.Tensor: + # this function is modified from https://github.com/Coac/never-give-up/blob/main/embedding_model.py + state_dist = torch.cdist(current_controllable_state.unsqueeze(0), episodic_memory, p=2).squeeze(0).sort()[0][:k] + self._running_mean_std_episodic_dist.update(state_dist.cpu().numpy()) + state_dist = state_dist / (self._running_mean_std_episodic_dist.mean + 1e-11) + + state_dist = 
torch.clamp(state_dist - kernel_cluster_distance, min=0, max=None) + kernel = kernel_epsilon / (state_dist + kernel_epsilon) + s = torch.sqrt(torch.clamp(torch.sum(kernel), min=0, max=None)) + c + + if s > siminarity_max: + print('s > siminarity_max:', s.max(), s.min()) + return torch.tensor(0) # NOTE + return 1 / s + # average value 1/( ( 10* 1e-4/(1+1e-4) )**(1/2)+1e-3 ) = 30 + + def forward(self, obs: list, is_null: list) -> Tuple[torch.Tensor, torch.Tensor]: + batch_size = len(obs) + seq_length = len(obs[0]) + + # stack episode dim + obs = [torch.stack(episode_obs, dim=0) for episode_obs in obs] + + # stack batch dim + # way 0 + if isinstance(self.obs_shape, int): + obs = torch.stack(obs, dim=0).view(batch_size * seq_length, self.obs_shape).to(self.device) + else: # len(self.cfg.obs_shape) == 3 for image obs + obs = torch.stack(obs, dim=0).view(batch_size * seq_length, *self.obs_shape).to(self.device) + # way 2 + # obs = torch.cat(obs, 0) + + with torch.no_grad(): + cur_obs_embedding = self.embedding_net(obs) + cur_obs_embedding = cur_obs_embedding.view(batch_size, seq_length, -1) + episodic_reward = [[] for _ in range(batch_size)] + null_cnt = 0 # the number of null transitions in the whole minibatch + for i in range(batch_size): + for j in range(seq_length): + if j < 10: + # if self._running_mean_std_episodic_reward.mean is not None: + # episodic_reward[i].append(torch.tensor(self._running_mean_std_episodic_reward.mean).to(self.device)) + # else: + episodic_reward[i].append(torch.tensor(0.).to(self.device)) + elif j: + episodic_memory = cur_obs_embedding[i][:j] + reward = self._compute_intrinsic_reward(episodic_memory, + cur_obs_embedding[i][j]).to(self.device) + episodic_reward[i].append(reward) + + if torch.nonzero(torch.tensor(is_null[i]).float()).shape[0] != 0: + # TODO(pu): if have null padding, the episodic_reward should be 0 + null_start_index = int(torch.nonzero(torch.tensor(is_null[i]).float()).squeeze(-1)[0]) + # add the number of null transitions 
in i'th sequence in batch + null_cnt = null_cnt + seq_length - null_start_index + for k in range(null_start_index, seq_length): + episodic_reward[i][k] = torch.tensor(0).to(self.device) + # episodic_reward[i][null_start_index:-1]=[torch.tensor(0).to(self.device) + # for i in range(seq_length-null_start_index)] + + # list(list(tensor)) -> tensor + tmp = [torch.stack(episodic_reward_tmp, dim=0) for episodic_reward_tmp in episodic_reward] + # stack batch dim + episodic_reward = torch.stack(tmp, dim=0) # TODO(pu): image case + episodic_reward = episodic_reward.view(-1) # torch.Size([32, 42]) -> torch.Size([32*42] + + episodic_reward_real_mean = sum(episodic_reward) / ( + batch_size * seq_length - null_cnt + ) # TODO(pu): recompute mean + + return episodic_reward, episodic_reward_real_mean + + def learn(self, inputs: Dict) -> torch.Tensor: + # obs: torch.Tensor, next_obs: torch.Tensor, action: torch.Tensor + cur_obs_embedding = self.embedding_net(inputs['obs']) + next_obs_embedding = self.embedding_net(inputs['next_obs']) + # get pred action + obs_plus_next_obs = torch.cat([cur_obs_embedding, next_obs_embedding], dim=-1) + pred_action_logits = self.inverse_net(obs_plus_next_obs) + loss = F.cross_entropy(pred_action_logits, inputs['action'].squeeze(-1)) + return loss diff --git a/ding/reward_model/ngu_reward_model.py b/ding/reward_model/ngu_reward_model.py index 5a8758bdb7..79320bd69f 100644 --- a/ding/reward_model/ngu_reward_model.py +++ b/ding/reward_model/ngu_reward_model.py @@ -1,18 +1,15 @@ import copy import random -from typing import Union, Tuple, Dict, List import numpy as np import torch -import torch.nn as nn -import torch.nn.functional as F import torch.optim as optim from easydict import EasyDict -from ding.model import FCEncoder, ConvEncoder from ding.utils import RunningMeanStd from ding.utils import SequenceType, REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel +from .network import RNDNetwork, InverseNetwork def 
collect_data_and_exclude_null_data_rnd(data_in): @@ -69,31 +66,6 @@ def collect_data_episodic(data_in): return res, is_null_list -class RndNetwork(nn.Module): - - def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType) -> None: - super(RndNetwork, self).__init__() - if isinstance(obs_shape, int) or len(obs_shape) == 1: - self.target = FCEncoder(obs_shape, hidden_size_list) - self.predictor = FCEncoder(obs_shape, hidden_size_list) - elif len(obs_shape) == 3: - self.target = ConvEncoder(obs_shape, hidden_size_list) - self.predictor = ConvEncoder(obs_shape, hidden_size_list) - else: - raise KeyError( - "not support obs_shape for pre-defined encoder: {}, " - "please customize your own RND model".format(obs_shape) - ) - for param in self.target.parameters(): - param.requires_grad = False - - def forward(self, obs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - predict_feature = self.predictor(obs) - with torch.no_grad(): - target_feature = self.target(obs) - return predict_feature, target_feature - - @REWARD_MODEL_REGISTRY.register('rnd-ngu') class RndNGURewardModel(BaseRewardModel): r""" @@ -116,28 +88,30 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> assert device == "cpu" or device.startswith("cuda") self.device = device self.tb_logger = tb_logger - self.reward_model = RndNetwork(config.obs_shape, config.hidden_size_list) + self.reward_model = RNDNetwork(config.obs_shape, config.hidden_size_list) self.reward_model.to(self.device) self.intrinsic_reward_type = config.intrinsic_reward_type assert self.intrinsic_reward_type in ['add', 'new', 'assign'] self.train_data_total = [] self.train_data = [] self.opt = optim.Adam(self.reward_model.predictor.parameters(), config.learning_rate) + self.train_cnt_icm = 0 self.estimate_cnt_rnd = 0 self._running_mean_std_rnd = RunningMeanStd(epsilon=1e-4) self.only_use_last_five_frames = config.only_use_last_five_frames_for_icm_rnd - def _train(self) -> None: + 
def _train(self) -> torch.Tensor: train_data: list = random.sample(list(self.train_data_cur), self.cfg.batch_size) train_data: torch.Tensor = torch.stack(train_data).to(self.device) - predict_feature, target_feature = self.reward_model(train_data) - loss = F.mse_loss(predict_feature, target_feature.detach()) + loss = self.reward_model.learn(train_data) self.opt.zero_grad() loss.backward() self.opt.step() + return loss + def train(self) -> None: if self.only_use_last_five_frames: # self.train_obs shape list(list) [batch_size,seq_length,N @@ -167,7 +141,9 @@ def train(self) -> None: # self.train_data = tmp for _ in range(self.cfg.update_per_collect): - self._train() + loss = self._train() + self.tb_logger.add_scalar('rnd_reward/loss', loss, self.train_cnt_icm) + self.train_cnt_icm += 1 def estimate(self, data: list) -> torch.Tensor: """ @@ -180,8 +156,7 @@ def estimate(self, data: list) -> torch.Tensor: obs = torch.stack(obs).to(self.device) with torch.no_grad(): - predict_feature, target_feature = self.reward_model(obs) - reward = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1) + reward = self.reward_model.forward(obs, norm=False) self._running_mean_std_rnd.update(reward.cpu().numpy()) # transform to mean 1 std 1 reward = 1 + (reward - self._running_mean_std_rnd.mean) / (self._running_mean_std_rnd.std + 1e-11) @@ -209,39 +184,6 @@ def reward_deepcopy(self, train_data): return train_data_reward_deepcopy -class InverseNetwork(nn.Module): - - def __init__(self, obs_shape: Union[int, SequenceType], action_shape, hidden_size_list: SequenceType) -> None: - super(InverseNetwork, self).__init__() - if isinstance(obs_shape, int) or len(obs_shape) == 1: - self.embedding_net = FCEncoder(obs_shape, hidden_size_list) - elif len(obs_shape) == 3: - self.embedding_net = ConvEncoder(obs_shape, hidden_size_list) - else: - raise KeyError( - "not support obs_shape for pre-defined encoder: {}, please customize your own RND model". 
- format(obs_shape) - ) - self.inverse_net = nn.Sequential( - nn.Linear(hidden_size_list[-1] * 2, 512), nn.ReLU(inplace=True), nn.Linear(512, action_shape) - ) - - def forward(self, inputs: Dict, inference: bool = False) -> Dict: - if inference: - with torch.no_grad(): - cur_obs_embedding = self.embedding_net(inputs['obs']) - return cur_obs_embedding - else: - # obs: torch.Tensor, next_obs: torch.Tensor - cur_obs_embedding = self.embedding_net(inputs['obs']) - next_obs_embedding = self.embedding_net(inputs['next_obs']) - # get pred action - obs_plus_next_obs = torch.cat([cur_obs_embedding, next_obs_embedding], dim=-1) - pred_action_logits = self.inverse_net(obs_plus_next_obs) - pred_action_probs = nn.Softmax(dim=-1)(pred_action_logits) - return pred_action_logits, pred_action_probs - - @REWARD_MODEL_REGISTRY.register('episodic') class EpisodicNGURewardModel(BaseRewardModel): r""" @@ -275,7 +217,9 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> assert device == "cpu" or device.startswith("cuda") self.device = device self.tb_logger = tb_logger - self.episodic_reward_model = InverseNetwork(config.obs_shape, config.action_shape, config.hidden_size_list) + self.episodic_reward_model = InverseNetwork( + config.obs_shape, config.action_shape, config.hidden_size_list, self.device + ) self.episodic_reward_model.to(self.device) self.intrinsic_reward_type = config.intrinsic_reward_type assert self.intrinsic_reward_type in ['add', 'new', 'assign'] @@ -283,11 +227,10 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> self.train_action_total = [] self.opt = optim.Adam(self.episodic_reward_model.parameters(), config.learning_rate) self.estimate_cnt_episodic = 0 - self._running_mean_std_episodic_dist = RunningMeanStd(epsilon=1e-4) - self._running_mean_std_episodic_reward = RunningMeanStd(epsilon=1e-4) + self.train_cnt_episodic = 0 self.only_use_last_five_frames = config.only_use_last_five_frames_for_icm_rnd - def 
_train(self) -> None: + def _train(self) -> torch.Tensor: # sample episode's timestep index train_index = np.random.randint(low=0, high=self.train_obs.shape[0], size=self.cfg.batch_size) @@ -295,14 +238,14 @@ def _train(self) -> None: train_next_obs: torch.Tensor = self.train_next_obs[train_index].to(self.device) train_action: torch.Tensor = self.train_action[train_index].to(self.device) - train_data = {'obs': train_obs, 'next_obs': train_next_obs} - pred_action_logits, pred_action_probs = self.episodic_reward_model(train_data) - - inverse_loss = F.cross_entropy(pred_action_logits, train_action.squeeze(-1)) + train_data = {'obs': train_obs, 'next_obs': train_next_obs, 'action': train_action} + inverse_loss = self.episodic_reward_model.learn(train_data) self.opt.zero_grad() inverse_loss.backward() self.opt.step() + return inverse_loss + def train(self) -> None: self.train_next_obs_total = copy.deepcopy(self.train_obs_total) @@ -332,32 +275,9 @@ def train(self) -> None: self.train_action = torch.cat(self.train_action, 0) for _ in range(self.cfg.update_per_collect): - self._train() - - def _compute_intrinsic_reward( - self, - episodic_memory: List, - current_controllable_state: torch.Tensor, - k=10, - kernel_cluster_distance=0.008, - kernel_epsilon=0.0001, - c=0.001, - siminarity_max=8, - ) -> torch.Tensor: - # this function is modified from https://github.com/Coac/never-give-up/blob/main/embedding_model.py - state_dist = torch.cdist(current_controllable_state.unsqueeze(0), episodic_memory, p=2).squeeze(0).sort()[0][:k] - self._running_mean_std_episodic_dist.update(state_dist.cpu().numpy()) - state_dist = state_dist / (self._running_mean_std_episodic_dist.mean + 1e-11) - - state_dist = torch.clamp(state_dist - kernel_cluster_distance, min=0, max=None) - kernel = kernel_epsilon / (state_dist + kernel_epsilon) - s = torch.sqrt(torch.clamp(torch.sum(kernel), min=0, max=None)) + c - - if s > siminarity_max: - print('s > siminarity_max:', s.max(), s.min()) - return 
torch.tensor(0) # NOTE - return 1 / s - # average value 1/( ( 10* 1e-4/(1+1e-4) )**(1/2)+1e-3 ) = 30 + loss = self._train() + self.tb_logger.add_scalar('episodic_reward/train_loss', loss, self.train_cnt_episodic) + self.train_cnt_episodic += 1 def estimate(self, data: list) -> torch.Tensor: """ @@ -365,63 +285,10 @@ def estimate(self, data: list) -> torch.Tensor: """ obs, is_null = collect_data_episodic(data) - # obs shape list(list()) [batch_size,seq_length,obs_dim] - batch_size = len(obs) - seq_length = len(obs[0]) - - # stack episode dim - obs = [torch.stack(episode_obs, dim=0) for episode_obs in obs] - # stack batch dim - # way 0 - if isinstance(self.cfg.obs_shape, int): - obs = torch.stack(obs, dim=0).view(batch_size * seq_length, self.cfg.obs_shape).to(self.device) - else: # len(self.cfg.obs_shape) == 3 for image obs - obs = torch.stack(obs, dim=0).view(batch_size * seq_length, *self.cfg.obs_shape).to(self.device) - # way 2 - # obs = torch.cat(obs, 0) - - inputs = {'obs': obs, 'is_null': is_null} with torch.no_grad(): - cur_obs_embedding = self.episodic_reward_model(inputs, inference=True) - cur_obs_embedding = cur_obs_embedding.view(batch_size, seq_length, -1) - episodic_reward = [[] for _ in range(batch_size)] - null_cnt = 0 # the number of null transitions in the whole minibatch - for i in range(batch_size): - for j in range(seq_length): - if j < 10: - # if self._running_mean_std_episodic_reward.mean is not None: - # episodic_reward[i].append(torch.tensor(self._running_mean_std_episodic_reward.mean).to(self.device)) - # else: - episodic_reward[i].append(torch.tensor(0.).to(self.device)) - elif j: - episodic_memory = cur_obs_embedding[i][:j] - reward = self._compute_intrinsic_reward(episodic_memory, - cur_obs_embedding[i][j]).to(self.device) - episodic_reward[i].append(reward) - - if torch.nonzero(torch.tensor(is_null[i]).float()).shape[0] != 0: - # TODO(pu): if have null padding, the episodic_reward should be 0 - not_null_index = 
torch.nonzero(torch.tensor(is_null[i]).float()).squeeze(-1) - null_start_index = int(torch.nonzero(torch.tensor(is_null[i]).float()).squeeze(-1)[0]) - # add the number of null transitions in i'th sequence in batch - null_cnt = null_cnt + seq_length - null_start_index - for k in range(null_start_index, seq_length): - episodic_reward[i][k] = torch.tensor(0).to(self.device) - # episodic_reward[i][null_start_index:-1]=[torch.tensor(0).to(self.device) - # for i in range(seq_length-null_start_index)] - - # list(list(tensor)) -> tensor - tmp = [torch.stack(episodic_reward_tmp, dim=0) for episodic_reward_tmp in episodic_reward] - # stack batch dim - episodic_reward = torch.stack(tmp, dim=0) # TODO(pu): image case - episodic_reward = episodic_reward.view(-1) # torch.Size([32, 42]) -> torch.Size([32*42] - - episodic_reward_real_mean = sum(episodic_reward) / ( - batch_size * seq_length - null_cnt - ) # TODO(pu): recompute mean + episodic_reward, episodic_reward_real_mean = self.episodic_reward_model.forward(obs, is_null) self.estimate_cnt_episodic += 1 - self._running_mean_std_episodic_reward.update(episodic_reward.cpu().numpy()) self.tb_logger.add_scalar( 'episodic_reward/episodic_reward_max', episodic_reward.max(), self.estimate_cnt_episodic diff --git a/dizoo/classic_control/cartpole/config/cartpole_bc_config.py b/dizoo/classic_control/cartpole/config/cartpole_bc_config.py index 8315e934fe..b1975718f3 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_bc_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_bc_config.py @@ -20,7 +20,7 @@ batch_size=64, learning_rate=0.01, learner=dict(hook=dict(save_ckpt_after_iter=1000)), - train_epoch = 20, + train_epoch=20, ), eval=dict(evaluator=dict(eval_freq=40, )) ), diff --git a/dizoo/classic_control/cartpole/config/cartpole_ngu_config copy.py b/dizoo/classic_control/cartpole/config/cartpole_ngu_config copy.py new file mode 100644 index 0000000000..1eca4cd0bc --- /dev/null +++ 
b/dizoo/classic_control/cartpole/config/cartpole_ngu_config copy.py @@ -0,0 +1,126 @@ +from easydict import EasyDict + +collector_env_num = 8 +evaluator_env_num = 8 +nstep = 5 +cartpole_ppo_rnd_config = dict( + exp_name='cartpole_ngu_seed0', + env=dict( + collector_env_num=collector_env_num, + evaluator_env_num=evaluator_env_num, + n_evaluator_episode=evaluator_env_num, + #'ALE/Pong-v5' is available. But special setting is needed after gym make. + obs_plus_prev_action_reward=True, # use specific env wrapper for ngu policy + stop_value=195, + ), + rnd_reward_model=dict( + intrinsic_reward_type='add', + learning_rate=1e-4, + obs_shape=4, + action_shape=2, + batch_size=320, + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='rnd-ngu', + ), + episodic_reward_model=dict( + # means if using rescale trick to the last non-zero reward + # when combing extrinsic and intrinsic reward. + # the rescale trick only used in: + # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal + # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander + # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, + # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the + # original last nonzero extrinsic reward. + # please refer to ngu_reward_model for details. + last_nonzero_reward_rescale=False, + # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True + # please refer to ngu_reward_model for details. 
+ last_nonzero_reward_weight=1, + intrinsic_reward_type='add', + learning_rate=1e-4, + obs_shape=4, + action_shape=2, + batch_size=320, + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='episodic', + ), + policy=dict( + cuda=True, + on_policy=False, + priority=True, + priority_IS_weight=True, + discount_factor=0.997, + nstep=nstep, + burnin_step=2, + # (int) is the total length of [sequence sample] minus + # the length of burnin part in [sequence sample], + # i.e., = = + + learn_unroll_len=40, # set this key according to the episode length + model=dict( + obs_shape=4, + action_shape=2, + encoder_hidden_size_list=[128, 128, 512], + collector_env_num=collector_env_num, + ), + learn=dict( + update_per_collect=8, + batch_size=64, + learning_rate=0.0005, + target_update_theta=0.001, + ), + collect=dict( + # NOTE: It is important that set key traj_len_inf=True here, + # to make sure self._traj_len=INF in serial_sample_collector.py. + # In sequence-based policy, for each collect_env, + # we want to collect data of length self._traj_len=INF + # unless the episode enters the 'done' state. + # In each collect phase, we collect a total of sequence samples. 
+ n_sample=32, + traj_len_inf=True, + env_num=collector_env_num, + ), + eval=dict(env_num=evaluator_env_num, ), + other=dict( + eps=dict( + type='exp', + start=0.95, + end=0.05, + decay=1e5, + ), + replay_buffer=dict( + replay_buffer_size=int(2e4), + # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization + alpha=0.6, + # (Float type) How much correction is used: 0 means no correction while 1 means full correction + beta=0.4, + ) + ), + ), +) +cartpole_ppo_rnd_config = EasyDict(cartpole_ppo_rnd_config) +main_config = cartpole_ppo_rnd_config +cartpole_ppo_rnd_create_config = dict( + env=dict( + type='cartpole', + import_names=['dizoo.classic_control.cartpole.envs.cartpole_env'], + ), + env_manager=dict(type='base'), + policy=dict(type='ngu'), + rnd_reward_model=dict(type='rnd-ngu'), + episodic_reward_model=dict(type='episodic'), +) +cartpole_ppo_rnd_create_config = EasyDict(cartpole_ppo_rnd_create_config) +create_config = cartpole_ppo_rnd_create_config + +if __name__ == "__main__": + # or you can enter `ding -m serial_ngu -c pong_ngu_config.py -s 0` + from ding.entry import serial_pipeline_ngu + serial_pipeline_ngu([main_config, create_config], seed=0) From eddc80d05743aafb6c03c0d181922f8a2c94e801 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Wed, 26 Apr 2023 01:38:23 -0400 Subject: [PATCH 19/52] change reward entry --- ding/entry/serial_entry_reward_model_offpolicy.py | 9 +++++++-- ding/entry/serial_entry_reward_model_onpolicy.py | 11 ++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/ding/entry/serial_entry_reward_model_offpolicy.py b/ding/entry/serial_entry_reward_model_offpolicy.py index bcfd0f882e..84dd47f462 100644 --- a/ding/entry/serial_entry_reward_model_offpolicy.py +++ b/ding/entry/serial_entry_reward_model_offpolicy.py @@ -24,6 +24,8 @@ def serial_pipeline_reward_model_offpolicy( model: Optional[torch.nn.Module] = None, max_train_iter: Optional[int] = int(1e10), max_env_step: 
Optional[int] = int(1e10), + cooptrain_reward: Optional[bool] = True, + pretrain_reward: Optional[bool] = False, ) -> 'Policy': # noqa """ Overview: @@ -78,6 +80,8 @@ def serial_pipeline_reward_model_offpolicy( cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode ) reward_model = create_reward_model(cfg.reward_model, policy.collect_mode.get_attribute('device'), tb_logger) + if pretrain_reward: + reward_model.train() # ========== # Main loop @@ -108,8 +112,9 @@ def serial_pipeline_reward_model_offpolicy( # collect data for reward_model training reward_model.collect_data(new_data) replay_buffer.push(new_data, cur_collector_envstep=collector.envstep) - # update reward_model - reward_model.train() + # update reward_model, when you want to train reward_model inloop + if cooptrain_reward: + reward_model.train() # clear buffer per fix iters to make sure replay buffer's data count isn't too few. if count % cfg.reward_model.clear_buffer_per_iters == 0: reward_model.clear_data() diff --git a/ding/entry/serial_entry_reward_model_onpolicy.py b/ding/entry/serial_entry_reward_model_onpolicy.py index a30e5d8ef9..b5cf05f8d0 100644 --- a/ding/entry/serial_entry_reward_model_onpolicy.py +++ b/ding/entry/serial_entry_reward_model_onpolicy.py @@ -24,6 +24,8 @@ def serial_pipeline_reward_model_onpolicy( model: Optional[torch.nn.Module] = None, max_train_iter: Optional[int] = int(1e10), max_env_step: Optional[int] = int(1e10), + cooptrain_reward: Optional[bool] = True, + pretrain_reward: Optional[bool] = False, ) -> 'Policy': # noqa """ Overview: @@ -78,7 +80,8 @@ def serial_pipeline_reward_model_onpolicy( cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode ) reward_model = create_reward_model(cfg.reward_model, policy.collect_mode.get_attribute('device'), tb_logger) - + if pretrain_reward: + reward_model.train() # ========== # Main loop # ========== @@ -106,9 +109,11 @@ def 
serial_pipeline_reward_model_onpolicy( new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs) new_data_count += len(new_data) # collect data for reward_model training - reward_model.collect_data(new_data) + if cooptrain_reward: + reward_model.collect_data(new_data) # update reward_model - reward_model.train() + if cooptrain_reward: + reward_model.train() if count % cfg.reward_model.clear_buffer_per_iters == 0: reward_model.clear_data() # Learn policy from collected data From 6e2b867746f8f4553c7d0c4e5a5058cf6c2dc609 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Wed, 26 Apr 2023 06:23:16 -0400 Subject: [PATCH 20/52] change trex entry to new entry, combine old trex test to new test --- .../serial_entry_reward_model_offpolicy.py | 18 +++---- .../serial_entry_reward_model_onpolicy.py | 18 +++---- .../test_serial_entry_preference_based_irl.py | 39 -------------- ...ial_entry_preference_based_irl_onpolicy.py | 38 ------------- .../tests/test_serial_entry_reward_model.py | 53 ++++++++++++++----- ding/reward_model/trex_reward_model.py | 20 ++++--- .../config/cartpole_trex_offppo_config.py | 5 +- .../config/cartpole_trex_onppo_config.py | 5 +- 8 files changed, 76 insertions(+), 120 deletions(-) delete mode 100644 ding/entry/tests/test_serial_entry_preference_based_irl.py delete mode 100644 ding/entry/tests/test_serial_entry_preference_based_irl_onpolicy.py diff --git a/ding/entry/serial_entry_reward_model_offpolicy.py b/ding/entry/serial_entry_reward_model_offpolicy.py index 84dd47f462..d3f6c59390 100644 --- a/ding/entry/serial_entry_reward_model_offpolicy.py +++ b/ding/entry/serial_entry_reward_model_offpolicy.py @@ -18,14 +18,14 @@ def serial_pipeline_reward_model_offpolicy( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - env_setting: Optional[List[Any]] = None, - model: Optional[torch.nn.Module] = None, - max_train_iter: Optional[int] = int(1e10), - max_env_step: Optional[int] = int(1e10), - cooptrain_reward: 
Optional[bool] = True, - pretrain_reward: Optional[bool] = False, + input_cfg: Union[str, Tuple[dict, dict]], + seed: int = 0, + env_setting: Optional[List[Any]] = None, + model: Optional[torch.nn.Module] = None, + max_train_iter: Optional[int] = int(1e10), + max_env_step: Optional[int] = int(1e10), + cooptrain_reward: Optional[bool] = True, + pretrain_reward: Optional[bool] = False, ) -> 'Policy': # noqa """ Overview: @@ -116,7 +116,7 @@ def serial_pipeline_reward_model_offpolicy( if cooptrain_reward: reward_model.train() # clear buffer per fix iters to make sure replay buffer's data count isn't too few. - if count % cfg.reward_model.clear_buffer_per_iters == 0: + if hasattr(cfg.reward_model, 'clear_buffer_per_iters') and count % cfg.reward_model.clear_buffer_per_iters == 0: reward_model.clear_data() # Learn policy from collected data for i in range(cfg.policy.learn.update_per_collect): diff --git a/ding/entry/serial_entry_reward_model_onpolicy.py b/ding/entry/serial_entry_reward_model_onpolicy.py index b5cf05f8d0..f67e230db4 100644 --- a/ding/entry/serial_entry_reward_model_onpolicy.py +++ b/ding/entry/serial_entry_reward_model_onpolicy.py @@ -18,14 +18,14 @@ def serial_pipeline_reward_model_onpolicy( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - env_setting: Optional[List[Any]] = None, - model: Optional[torch.nn.Module] = None, - max_train_iter: Optional[int] = int(1e10), - max_env_step: Optional[int] = int(1e10), - cooptrain_reward: Optional[bool] = True, - pretrain_reward: Optional[bool] = False, + input_cfg: Union[str, Tuple[dict, dict]], + seed: int = 0, + env_setting: Optional[List[Any]] = None, + model: Optional[torch.nn.Module] = None, + max_train_iter: Optional[int] = int(1e10), + max_env_step: Optional[int] = int(1e10), + cooptrain_reward: Optional[bool] = True, + pretrain_reward: Optional[bool] = False, ) -> 'Policy': # noqa """ Overview: @@ -114,7 +114,7 @@ def serial_pipeline_reward_model_onpolicy( # update reward_model if 
cooptrain_reward: reward_model.train() - if count % cfg.reward_model.clear_buffer_per_iters == 0: + if hasattr(cfg.reward_model, 'clear_buffer_per_iters') and count % cfg.reward_model.clear_buffer_per_iters == 0: reward_model.clear_data() # Learn policy from collected data for i in range(cfg.policy.learn.update_per_collect): diff --git a/ding/entry/tests/test_serial_entry_preference_based_irl.py b/ding/entry/tests/test_serial_entry_preference_based_irl.py deleted file mode 100644 index 3b577ce916..0000000000 --- a/ding/entry/tests/test_serial_entry_preference_based_irl.py +++ /dev/null @@ -1,39 +0,0 @@ -import pytest -from copy import deepcopy -import os -from easydict import EasyDict - -import torch - -from ding.entry import serial_pipeline -from ding.entry import serial_pipeline_preference_based_irl -from dizoo.classic_control.cartpole.config.cartpole_trex_offppo_config import cartpole_trex_offppo_config,\ - cartpole_trex_offppo_create_config -from dizoo.classic_control.cartpole.config.cartpole_ppo_offpolicy_config import cartpole_ppo_offpolicy_config,\ - cartpole_ppo_offpolicy_create_config -from ding.entry.application_entry_trex_collect_data import trex_collecting_data - - -@pytest.mark.unittest -def test_serial_pipeline_trex(): - exp_name = 'test_serial_pipeline_trex_expert' - config = [deepcopy(cartpole_ppo_offpolicy_config), deepcopy(cartpole_ppo_offpolicy_create_config)] - config[0].policy.learn.learner.hook.save_ckpt_after_iter = 100 - config[0].exp_name = exp_name - expert_policy = serial_pipeline(config, seed=0) - - exp_name = 'test_serial_pipeline_trex_collect' - config = [deepcopy(cartpole_trex_offppo_config), deepcopy(cartpole_trex_offppo_create_config)] - config[0].exp_name = exp_name - config[0].reward_model.expert_model_path = 'test_serial_pipeline_trex_expert' - config[0].reward_model.checkpoint_max = 100 - config[0].reward_model.checkpoint_step = 100 - config[0].reward_model.num_snippets = 100 - args = EasyDict({'cfg': deepcopy(config), 'seed': 
0, 'device': 'cpu'}) - trex_collecting_data(args=args) - try: - serial_pipeline_preference_based_irl(config, seed=0, max_train_iter=1) - except Exception: - assert False, "pipeline fail" - finally: - os.popen('rm -rf test_serial_pipeline_trex*') diff --git a/ding/entry/tests/test_serial_entry_preference_based_irl_onpolicy.py b/ding/entry/tests/test_serial_entry_preference_based_irl_onpolicy.py deleted file mode 100644 index ff0e88b0d5..0000000000 --- a/ding/entry/tests/test_serial_entry_preference_based_irl_onpolicy.py +++ /dev/null @@ -1,38 +0,0 @@ -import pytest -from copy import deepcopy -import os -from easydict import EasyDict - -import torch - -from ding.entry import serial_pipeline_onpolicy -from ding.entry import serial_pipeline_preference_based_irl_onpolicy -from dizoo.classic_control.cartpole.config import cartpole_ppo_config, cartpole_ppo_create_config -from dizoo.classic_control.cartpole.config import cartpole_trex_ppo_onpolicy_config, \ - cartpole_trex_ppo_onpolicy_create_config -from ding.entry.application_entry_trex_collect_data import trex_collecting_data - - -@pytest.mark.unittest -def test_serial_pipeline_trex_onpolicy(): - exp_name = 'test_serial_pipeline_trex_onpolicy_expert' - config = [deepcopy(cartpole_ppo_config), deepcopy(cartpole_ppo_create_config)] - config[0].policy.learn.learner.hook.save_ckpt_after_iter = 100 - config[0].exp_name = exp_name - expert_policy = serial_pipeline_onpolicy(config, seed=0) - - exp_name = 'test_serial_pipeline_trex_onpolicy_collect' - config = [deepcopy(cartpole_trex_ppo_onpolicy_config), deepcopy(cartpole_trex_ppo_onpolicy_create_config)] - config[0].exp_name = exp_name - config[0].reward_model.expert_model_path = 'test_serial_pipeline_trex_onpolicy_expert' - config[0].reward_model.checkpoint_max = 100 - config[0].reward_model.checkpoint_step = 100 - config[0].reward_model.num_snippets = 100 - args = EasyDict({'cfg': deepcopy(config), 'seed': 0, 'device': 'cpu'}) - trex_collecting_data(args=args) - try: - 
serial_pipeline_preference_based_irl_onpolicy(config, seed=0, max_train_iter=1) - except Exception: - assert False, "pipeline fail" - finally: - os.popen('rm -rf test_serial_pipeline_trex_onpolicy*') diff --git a/ding/entry/tests/test_serial_entry_reward_model.py b/ding/entry/tests/test_serial_entry_reward_model.py index de07b67d31..7fb707431f 100644 --- a/ding/entry/tests/test_serial_entry_reward_model.py +++ b/ding/entry/tests/test_serial_entry_reward_model.py @@ -5,31 +5,31 @@ from copy import deepcopy from dizoo.classic_control.cartpole.config.cartpole_dqn_config import cartpole_dqn_config, cartpole_dqn_create_config +from dizoo.classic_control.cartpole.config.cartpole_trex_offppo_config import cartpole_trex_offppo_config,\ + cartpole_trex_offppo_create_config from dizoo.classic_control.cartpole.config.cartpole_ppo_offpolicy_config import cartpole_ppo_offpolicy_config, cartpole_ppo_offpolicy_create_config # noqa from dizoo.classic_control.cartpole.config.cartpole_rnd_onppo_config import cartpole_ppo_rnd_config, cartpole_ppo_rnd_create_config # noqa from dizoo.classic_control.cartpole.config.cartpole_ppo_icm_config import cartpole_ppo_icm_config, cartpole_ppo_icm_create_config # noqa from ding.entry import serial_pipeline, collect_demo_data, serial_pipeline_reward_model_offpolicy, \ serial_pipeline_reward_model_onpolicy +from ding.entry.application_entry_trex_collect_data import trex_collecting_data cfg = [ { 'type': 'pdeil', "alpha": 0.5, "discrete_action": False - }, - { + }, { 'type': 'gail', 'input_size': 5, 'hidden_size_list': [64], 'batch_size': 64, - }, - { + }, { 'type': 'pwil', 's_size': 4, 'a_size': 2, 'sample_size': 500, - }, - { + }, { 'type': 'red', 'sample_size': 5000, 'obs_shape': 4, @@ -37,7 +37,22 @@ 'hidden_size_list': [64, 1], 'update_per_collect': 200, 'batch_size': 128, - }, + }, { + 'type': 'trex', + 'exp_name': 'cartpole_trex_offppo_seed0', + 'min_snippet_length': 5, + 'max_snippet_length': 100, + 'checkpoint_min': 0, + 'checkpoint_max': 
6, + 'checkpoint_step': 6, + 'learning_rate': 1e-5, + 'update_per_collect': 1, + 'expert_model_path': 'cartpole_ppo_offpolicy_seed0', + 'data_path': 'abs data path', + 'hidden_size_list': [512, 64, 1], + 'obs_shape': 4, + 'action_shape': 2, + } ] @@ -52,9 +67,15 @@ def test_irl(reward_model_config): expert_data_path = 'expert_data.pkl' state_dict = expert_policy.collect_mode.state_dict() config = deepcopy(cartpole_ppo_offpolicy_config), deepcopy(cartpole_ppo_offpolicy_create_config) - collect_demo_data( - config, seed=0, state_dict=state_dict, expert_data_path=expert_data_path, collect_count=collect_count - ) + if reward_model_config.type == 'trex': + trex_config = [deepcopy(cartpole_trex_offppo_config), deepcopy(cartpole_trex_offppo_create_config)] + trex_config[0].reward_model = reward_model_config + args = EasyDict({'cfg': deepcopy(trex_config), 'seed': 0, 'device': 'cpu'}) + trex_collecting_data(args=args) + else: + collect_demo_data( + config, seed=0, state_dict=state_dict, expert_data_path=expert_data_path, collect_count=collect_count + ) # irl + rl training cp_cartpole_dqn_config = deepcopy(cartpole_dqn_config) cp_cartpole_dqn_create_config = deepcopy(cartpole_dqn_create_config) @@ -65,10 +86,18 @@ def test_irl(reward_model_config): reward_model_config['expert_data_path'] = expert_data_path cp_cartpole_dqn_config.reward_model = reward_model_config cp_cartpole_dqn_config.policy.collect.n_sample = 128 + cooptrain_reward = True + pretrain_reward = False + if reward_model_config.type == 'trex': + cooptrain_reward = False + pretrain_reward = True serial_pipeline_reward_model_offpolicy( - (cp_cartpole_dqn_config, cp_cartpole_dqn_create_config), seed=0, max_train_iter=2 + (cp_cartpole_dqn_config, cp_cartpole_dqn_create_config), + seed=0, + max_train_iter=2, + pretrain_reward=pretrain_reward, + cooptrain_reward=cooptrain_reward ) - os.popen("rm -rf ckpt_* log expert_data.pkl") diff --git a/ding/reward_model/trex_reward_model.py 
b/ding/reward_model/trex_reward_model.py index 2d32074b49..87cfce49da 100644 --- a/ding/reward_model/trex_reward_model.py +++ b/ding/reward_model/trex_reward_model.py @@ -71,26 +71,24 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> assert device in ["cpu", "cuda"] or "cuda" in device self.device = device self.tb_logger = tb_logger - kernel_size = config.reward_model.kernel_size if 'kernel_size' in config else None - stride = config.reward_model.stride if 'stride' in config else None - self.reward_model = TREXNetwork( - self.cfg.policy.model.obs_shape, config.reward_model.hidden_size_list, kernel_size, stride - ) + kernel_size = config.kernel_size if 'kernel_size' in config else None + stride = config.stride if 'stride' in config else None + self.reward_model = TREXNetwork(self.cfg.obs_shape, config.hidden_size_list, kernel_size, stride) self.reward_model.to(self.device) self.pre_expert_data = [] self.train_data = [] self.expert_data_loader = None - self.opt = optim.Adam(self.reward_model.parameters(), config.reward_model.learning_rate) + self.opt = optim.Adam(self.reward_model.parameters(), config.learning_rate) self.train_iter = 0 self.learning_returns = [] self.training_obs = [] self.training_labels = [] - self.num_trajs = self.cfg.reward_model.num_trajs - self.num_snippets = self.cfg.reward_model.num_snippets + self.num_trajs = self.cfg.num_trajs + self.num_snippets = self.cfg.num_snippets # minimum number of short subtrajectories to sample - self.min_snippet_length = config.reward_model.min_snippet_length + self.min_snippet_length = config.min_snippet_length # maximum number of short subtrajectories to sample - self.max_snippet_length = config.reward_model.max_snippet_length + self.max_snippet_length = config.max_snippet_length self.l1_reg = 0 self.data_for_save = {} self._logger, self._tb_logger = build_logger( @@ -227,7 +225,7 @@ def train(self): cum_loss = 0.0 training_data = list(zip(training_inputs, training_outputs)) - 
for epoch in range(self.cfg.reward_model.update_per_collect): + for epoch in range(self.cfg.update_per_collect): np.random.shuffle(training_data) training_obs, training_labels = zip(*training_data) cum_loss = self._train(training_obs, training_labels) diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py index 12db6d7e82..b55d4ae391 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py @@ -10,6 +10,7 @@ ), reward_model=dict( type='trex', + exp_name='cartpole_trex_offppo_seed0', min_snippet_length=5, max_snippet_length=100, checkpoint_min=0, @@ -20,7 +21,9 @@ expert_model_path='abs model path', reward_model_path='abs data path + ./cartpole.params', data_path='abs data path', - hidden_size_list=[512, 64, 1] + hidden_size_list=[512, 64, 1], + obs_shape=4, + action_shape=2, ), policy=dict( cuda=False, diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py index 14ada190a6..f35893c584 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py @@ -10,6 +10,7 @@ ), reward_model=dict( type='trex', + exp_name='cartpole_trex_onppo_seed0', min_snippet_length=5, max_snippet_length=100, checkpoint_min=0, @@ -20,7 +21,9 @@ expert_model_path='abs model path', reward_model_path='abs data path + ./cartpole.params', data_path='abs data path', - hidden_size_list=[512, 64, 1] + hidden_size_list=[512, 64, 1], + obs_shape=4, + action_shape=2, ), policy=dict( cuda=False, From 97634dc3d6650f91b1d4f2323a364fb0e03e754b Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Thu, 27 Apr 2023 23:02:35 -0400 Subject: [PATCH 21/52] refactor trex config file --- ding/entry/tests/test_serial_entry_reward_model.py | 1 - 
ding/reward_model/trex_reward_model.py | 7 +++++++ .../config/serial/pong/pong_trex_offppo_config.py | 10 +++------- .../atari/config/serial/pong/pong_trex_sql_config.py | 11 ++++------- .../config/serial/qbert/qbert_trex_dqn_config.py | 6 ++++-- .../config/serial/qbert/qbert_trex_offppo_config.py | 6 ++++-- .../cartpole/config/cartpole_trex_dqn_config.py | 3 +++ .../cartpole/config/cartpole_trex_offppo_config.py | 2 -- .../cartpole/config/cartpole_trex_onppo_config.py | 2 -- dizoo/mujoco/config/halfcheetah_trex_onppo_config.py | 12 +++++------- dizoo/mujoco/config/halfcheetah_trex_sac_config.py | 12 +++++------- dizoo/mujoco/config/hopper_trex_onppo_config.py | 12 +++++------- dizoo/mujoco/config/hopper_trex_sac_config.py | 12 +++++------- 13 files changed, 45 insertions(+), 51 deletions(-) diff --git a/ding/entry/tests/test_serial_entry_reward_model.py b/ding/entry/tests/test_serial_entry_reward_model.py index 7fb707431f..89c1bf49ff 100644 --- a/ding/entry/tests/test_serial_entry_reward_model.py +++ b/ding/entry/tests/test_serial_entry_reward_model.py @@ -48,7 +48,6 @@ 'learning_rate': 1e-5, 'update_per_collect': 1, 'expert_model_path': 'cartpole_ppo_offpolicy_seed0', - 'data_path': 'abs data path', 'hidden_size_list': [512, 64, 1], 'obs_shape': 4, 'action_shape': 2, diff --git a/ding/reward_model/trex_reward_model.py b/ding/reward_model/trex_reward_model.py index 87cfce49da..25c700a40a 100644 --- a/ding/reward_model/trex_reward_model.py +++ b/ding/reward_model/trex_reward_model.py @@ -80,6 +80,7 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> self.expert_data_loader = None self.opt = optim.Adam(self.reward_model.parameters(), config.learning_rate) self.train_iter = 0 + self.estimate_iter = 0 self.learning_returns = [] self.training_obs = [] self.training_labels = [] @@ -229,7 +230,9 @@ def train(self): np.random.shuffle(training_data) training_obs, training_labels = zip(*training_data) cum_loss = self._train(training_obs, 
training_labels) + self.train_iter += 1 logging.info("[epoch {}] loss {}".format(epoch, cum_loss)) + self.tb_logger.add_scalar("trex_reward/train_loss_iteration", cum_loss, self.train_iter) # print out predicted cumulative returns and actual returns sorted_returns = sorted(self.learning_returns, key=lambda s: s[0]) demonstrations = [ @@ -303,6 +306,10 @@ def estimate(self, data: list) -> List[Dict]: res = torch.stack(res).to(self.device) with torch.no_grad(): sum_rewards = self.reward_model.forward(res) + self.tb_logger.add_scalar("trex_reward/estimate_reward_mean", sum_rewards.mean().item(), self.train_iter) + self.tb_logger.add_scalar("trex_reward/estimate_reward_std", sum_rewards.std().item(), self.train_iter) + self.tb_logger.add_scalar("trex_reward/estimate_reward_max", sum_rewards.max().item(), self.train_iter) + self.tb_logger.add_scalar("trex_reward/estimate_reward_min", sum_rewards.min().item(), self.train_iter) for item, rew in zip(train_data_augmented, sum_rewards): # TODO optimise this loop as well ? item['reward'] = rew diff --git a/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py b/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py index 743fa0571a..c4adb1c65b 100644 --- a/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py +++ b/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py @@ -25,13 +25,9 @@ # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /pong.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. 
- # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + hidden_size_list=[512, 64, 1], + obs_shape=[4, 84, 84], + action_shape=6, ), policy=dict( cuda=True, diff --git a/dizoo/atari/config/serial/pong/pong_trex_sql_config.py b/dizoo/atari/config/serial/pong/pong_trex_sql_config.py index a1d89af0d7..a9978ab324 100644 --- a/dizoo/atari/config/serial/pong/pong_trex_sql_config.py +++ b/dizoo/atari/config/serial/pong/pong_trex_sql_config.py @@ -13,6 +13,7 @@ ), reward_model=dict( type='trex', + exp_name='pong_trex_sql_seed0', min_snippet_length=50, max_snippet_length=100, checkpoint_min=10000, @@ -25,13 +26,9 @@ # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /pong.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. 
- # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + hidden_size_list=[512, 64, 1], + obs_shape=[4, 84, 84], + action_shape=6, ), policy=dict( cuda=False, diff --git a/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py b/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py index 2a038e37e6..3517d952f0 100644 --- a/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py +++ b/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py @@ -13,6 +13,7 @@ ), reward_model=dict( type='trex', + exp_name='qbert_trex_dqn_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=0, @@ -21,8 +22,9 @@ learning_rate=1e-5, update_per_collect=1, expert_model_path='abs model path', - reward_model_path='abs data path + ./qbert.params', - offline_data_path='abs data path', + hidden_size_list=[512, 64, 1], + obs_shape=[4, 84, 84], + action_shape=6, ), policy=dict( cuda=True, diff --git a/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py b/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py index 8b76e2f728..3d90da92d9 100644 --- a/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py +++ b/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py @@ -13,6 +13,7 @@ ), reward_model=dict( type='trex', + exp_name='qbert_trex_offppo_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=0, @@ -21,8 +22,9 @@ learning_rate=1e-5, update_per_collect=1, expert_model_path='abs model path', - reward_model_path='abs data path + ./qbert.params', - offline_data_path='abs data path', + hidden_size_list=[512, 64, 1], + obs_shape=[4, 84, 84], + action_shape=6, ), policy=dict( cuda=True, diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py index 306cadd6f2..bd65035c48 100644 --- 
a/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py @@ -20,6 +20,9 @@ num_trajs=6, num_snippets=6000, expert_model_path='cartpole_dqn_seed0', # expert model experiment directory path + hidden_size_list=[512, 64, 1], + obs_shape=4, + action_shape=2, ), policy=dict( cuda=False, diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py index b55d4ae391..6cd7a6f227 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py @@ -19,8 +19,6 @@ learning_rate=1e-5, update_per_collect=1, expert_model_path='abs model path', - reward_model_path='abs data path + ./cartpole.params', - data_path='abs data path', hidden_size_list=[512, 64, 1], obs_shape=4, action_shape=2, diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py index f35893c584..8989c902c1 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py @@ -19,8 +19,6 @@ learning_rate=1e-5, update_per_collect=1, expert_model_path='abs model path', - reward_model_path='abs data path + ./cartpole.params', - data_path='abs data path', hidden_size_list=[512, 64, 1], obs_shape=4, action_shape=2, diff --git a/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py b/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py index 6d635c212d..7682f60c88 100644 --- a/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py +++ b/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py @@ -12,6 +12,8 @@ stop_value=3000, ), reward_model=dict( + type='trex', + exp_name='halfcheetah_trex_onppo_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=10000, @@ -25,13 +27,9 
@@ # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /HalfCheetah.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. - # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + hidden_size_list=[512, 64, 1], + obs_shape=17, + action_shape=6, ), policy=dict( cuda=True, diff --git a/dizoo/mujoco/config/halfcheetah_trex_sac_config.py b/dizoo/mujoco/config/halfcheetah_trex_sac_config.py index 5f123682a0..9a31e1c493 100644 --- a/dizoo/mujoco/config/halfcheetah_trex_sac_config.py +++ b/dizoo/mujoco/config/halfcheetah_trex_sac_config.py @@ -12,6 +12,8 @@ stop_value=12000, ), reward_model=dict( + type='trex', + exp_name='halfcheetah_trex_sac_seed0', learning_rate=1e-5, min_snippet_length=30, max_snippet_length=100, @@ -24,13 +26,9 @@ # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /HalfCheetah.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. 
- # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + hidden_size_list=[512, 64, 1], + obs_shape=17, + action_shape=6, ), policy=dict( cuda=True, diff --git a/dizoo/mujoco/config/hopper_trex_onppo_config.py b/dizoo/mujoco/config/hopper_trex_onppo_config.py index e69451fe3c..c14d29d9fd 100644 --- a/dizoo/mujoco/config/hopper_trex_onppo_config.py +++ b/dizoo/mujoco/config/hopper_trex_onppo_config.py @@ -12,6 +12,8 @@ stop_value=3000, ), reward_model=dict( + type='trex', + exp_name='hopper_trex_onppo_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=10000, @@ -25,13 +27,9 @@ # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /Hopper.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. - # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + hidden_size_list=[512, 64, 1], + obs_shape=11, + action_shape=3, ), policy=dict( cuda=True, diff --git a/dizoo/mujoco/config/hopper_trex_sac_config.py b/dizoo/mujoco/config/hopper_trex_sac_config.py index 5c4aa6f2c1..e99d0636d8 100644 --- a/dizoo/mujoco/config/hopper_trex_sac_config.py +++ b/dizoo/mujoco/config/hopper_trex_sac_config.py @@ -12,6 +12,8 @@ stop_value=6000, ), reward_model=dict( + type='trex', + exp_name='hopper_trex_sac_seed0', learning_rate=1e-5, min_snippet_length=30, max_snippet_length=100, @@ -24,13 +26,9 @@ # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. 
# However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /Hopper.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. - # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + hidden_size_list=[512, 64, 1], + obs_shape=11, + action_shape=3, ), policy=dict( cuda=True, From f099cacb1e96ddcc9580014330a7d15cd0d8bbd2 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Thu, 27 Apr 2023 23:32:35 -0400 Subject: [PATCH 22/52] refactor trex config file --- .../lunarlander/config/lunarlander_trex_dqn_config.py | 11 ++++------- .../config/lunarlander_trex_offppo_config.py | 11 ++++------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py b/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py index 790ca5c271..99b2c4111e 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py @@ -14,6 +14,7 @@ ), reward_model=dict( type='trex', + exp_name='lunarlander_trex_dqn_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=1000, @@ -26,13 +27,9 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /lunarlander.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. - # In DI-engine, it is usually located in ``exp_name`` directory - # e.g. 
'exp_name/expert_data.pkl' - data_path='data_path_placeholder', + hidden_size_list=[512, 64, 1], + obs_shape=8, + action_shape=4, ), policy=dict( # Whether to use cuda for network. diff --git a/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py b/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py index 37e2c78fdd..4aa5bc4716 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py @@ -11,6 +11,7 @@ ), reward_model=dict( type='trex', + exp_name='lunarlander_trex_offppo_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=1000, @@ -23,13 +24,9 @@ # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /lunarlander.params', - # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. 
- # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + hidden_size_list=[512, 64, 1], + obs_shape=8, + action_shape=4, ), policy=dict( cuda=True, From 0c48c08ebece796ed7b4d25e9c1f534573590bc7 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Fri, 28 Apr 2023 02:09:38 -0400 Subject: [PATCH 23/52] refactor trex config file --- .../serial/pong/pong_trex_offppo_config.py | 8 ++++--- .../serial/pong/pong_trex_sql_config.py | 7 ++++--- .../serial/qbert/qbert_trex_dqn_config.py | 7 ++++--- .../serial/qbert/qbert_trex_offppo_config.py | 7 ++++--- .../spaceinvaders_trex_dqn_config.py | 21 +++++++------------ .../spaceinvaders_trex_offppo_config.py | 21 +++++++------------ .../config/lunarlander_trex_dqn_config.py | 7 ++++--- .../config/lunarlander_trex_offppo_config.py | 6 +++--- .../config/cartpole_trex_dqn_config.py | 8 +++++-- .../config/cartpole_trex_offppo_config.py | 8 ++++--- .../config/cartpole_trex_onppo_config.py | 9 +++++--- .../config/halfcheetah_trex_onppo_config.py | 6 +++--- .../config/halfcheetah_trex_sac_config.py | 6 +++--- .../mujoco/config/hopper_trex_onppo_config.py | 7 ++++--- dizoo/mujoco/config/hopper_trex_sac_config.py | 7 ++++--- .../config/walker2d_trex_onppo_config.py | 18 +++++++--------- .../mujoco/config/walker2d_trex_sac_config.py | 18 +++++++--------- 17 files changed, 87 insertions(+), 84 deletions(-) diff --git a/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py b/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py index c4adb1c65b..5096202a4d 100644 --- a/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py +++ b/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py @@ -13,6 +13,7 @@ ), reward_model=dict( type='trex', + exp_name='pong_trex_offppo_seed0', min_snippet_length=50, max_snippet_length=100, checkpoint_min=0, @@ -24,7 +25,7 @@ # Absolute path is recommended. 
# In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. - expert_model_path='model_path_placeholder', + expert_model_path='pong_ppo_seed0', hidden_size_list=[512, 64, 1], obs_shape=[4, 84, 84], action_shape=6, @@ -76,6 +77,7 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ppo_offpolicy'), + reward_model=dict(type='trex'), ) pong_trex_ppo_create_config = EasyDict(pong_trex_ppo_create_config) create_config = pong_trex_ppo_create_config @@ -87,7 +89,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_preference_based_irl + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -95,4 +97,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_preference_based_irl((main_config, create_config)) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/atari/config/serial/pong/pong_trex_sql_config.py b/dizoo/atari/config/serial/pong/pong_trex_sql_config.py index a9978ab324..26b8e59d94 100644 --- a/dizoo/atari/config/serial/pong/pong_trex_sql_config.py +++ b/dizoo/atari/config/serial/pong/pong_trex_sql_config.py @@ -25,7 +25,7 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. 
- expert_model_path='model_path_placeholder', + expert_model_path='pong_sql_seed0', hidden_size_list=[512, 64, 1], obs_shape=[4, 84, 84], action_shape=6, @@ -62,6 +62,7 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='sql'), + reward_model=dict(type='trex'), ) pong_trex_sql_create_config = EasyDict(pong_trex_sql_create_config) create_config = pong_trex_sql_create_config @@ -73,7 +74,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_preference_based_irl + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -81,4 +82,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_preference_based_irl((main_config, create_config)) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py b/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py index 3517d952f0..f4d6467d01 100644 --- a/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py +++ b/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py @@ -21,7 +21,7 @@ checkpoint_step=100, learning_rate=1e-5, update_per_collect=1, - expert_model_path='abs model path', + expert_model_path='qbert_dqn_seed0', hidden_size_list=[512, 64, 1], obs_shape=[4, 84, 84], action_shape=6, @@ -64,6 +64,7 @@ ), env_manager=dict(type='base'), policy=dict(type='dqn'), + reward_model=dict(type='trex'), ) qbert_trex_dqn_create_config = EasyDict(qbert_trex_dqn_create_config) create_config = qbert_trex_dqn_create_config @@ -76,7 +77,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from 
ding.entry import serial_pipeline_reward_model_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') @@ -85,4 +86,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_trex((main_config, create_config)) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py b/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py index 3d90da92d9..567d1370fe 100644 --- a/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py +++ b/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py @@ -21,7 +21,7 @@ checkpoint_step=100, learning_rate=1e-5, update_per_collect=1, - expert_model_path='abs model path', + expert_model_path='qbert_ppo_seed0', hidden_size_list=[512, 64, 1], obs_shape=[4, 84, 84], action_shape=6, @@ -71,6 +71,7 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ppo_offpolicy'), + reward_model=dict(type='trex'), ) create_config = EasyDict(qbert_trex_ppo_create_config) @@ -82,7 +83,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_reward_model_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') @@ -91,4 +92,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_trex((main_config, create_config)) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_dqn_config.py b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_dqn_config.py index 0d40d1e26c..a36c2c5579 100644 --- a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_dqn_config.py +++ b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_dqn_config.py @@ -15,6 +15,7 @@ ), reward_model=dict( type='trex', + exp_name='spaceinvaders_trex_dqn_seed0', min_snippet_length=50, max_snippet_length=100, checkpoint_min=10000, @@ -28,17 +29,10 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name``. # For example, if you want to use dqn to generate demos, you can use ``spaceinvaders_dqn`` - expert_model_path='model_path_placeholder', - # path to save reward model - # Users should add their own model path here. - # Absolute path is recommended. - # For example, if you use ``spaceinvaders_drex``, then the reward model will be saved in this directory. - reward_model_path='model_path_placeholder + ./spaceinvaders.params', - # path to save generated observations. - # Users should add their own model path here. - # Absolute path is recommended. - # For example, if you use ``spaceinvaders_drex``, then all the generated data will be saved in this directory. 
- offline_data_path='data_path_placeholder', + expert_model_path='spaceinvaders_dqn_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=[4, 84, 84], + action_shape=6, ), policy=dict( cuda=True, @@ -78,6 +72,7 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='dqn'), + reward_model=dict(type='trex'), ) spaceinvaders_trex_dqn_create_config = EasyDict(spaceinvaders_trex_dqn_create_config) create_config = spaceinvaders_trex_dqn_create_config @@ -89,7 +84,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -97,4 +92,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_trex([main_config, create_config]) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_offppo_config.py b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_offppo_config.py index 4b75d7289a..e387f6e5aa 100644 --- a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_offppo_config.py +++ b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_offppo_config.py @@ -15,6 +15,7 @@ ), reward_model=dict( type='trex', + exp_name='spaceinvaders_trex_offppo_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=0, @@ -27,17 +28,10 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name``. 
# For example, if you want to use dqn to generate demos, you can use ``spaceinvaders_dqn`` - expert_model_path='model_path_placeholder', - # path to save reward model - # Users should add their own model path here. - # Absolute path is recommended. - # For example, if you use ``spaceinvaders_drex``, then the reward model will be saved in this directory. - reward_model_path='model_path_placeholder + ./spaceinvaders.params', - # path to save generated observations. - # Users should add their own model path here. - # Absolute path is recommended. - # For example, if you use ``spaceinvaders_drex``, then all the generated data will be saved in this directory. - offline_data_path='data_path_placeholder', + expert_model_path='spaceinvaders_ppo_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=[4, 84, 84], + action_shape=6, ), policy=dict( cuda=True, @@ -85,6 +79,7 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ppo_offpolicy'), + reward_model=dict(type='trex'), ) spaceinvaders_trex_ppo_create_config = EasyDict(spaceinvaders_trex_ppo_create_config) create_config = spaceinvaders_trex_ppo_create_config @@ -96,7 +91,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -104,4 +99,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_trex([main_config, create_config]) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py b/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py index 99b2c4111e..86e0a6f3d5 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py @@ -26,7 +26,7 @@ # Users should add their own model path here. Model path should lead to a model. # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. - expert_model_path='model_path_placeholder', + expert_model_path='lunarlander_dqn_seed0', hidden_size_list=[512, 64, 1], obs_shape=8, action_shape=4, @@ -85,6 +85,7 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='dqn'), + reward_model=dict(type='trex'), ) lunarlander_trex_dqn_create_config = EasyDict(lunarlander_trex_dqn_create_config) create_config = lunarlander_trex_dqn_create_config @@ -96,7 +97,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -104,4 +105,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_trex([main_config, create_config]) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py b/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py index 4aa5bc4716..88f66fef9f 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py @@ -23,7 +23,7 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. - expert_model_path='model_path_placeholder', + expert_model_path='lunarlander_offppo_seed0', hidden_size_list=[512, 64, 1], obs_shape=8, action_shape=4, @@ -73,7 +73,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -81,4 +81,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_trex([main_config, create_config]) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py index bd65035c48..c2b1cf1943 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py @@ -10,6 +10,7 @@ ), reward_model=dict( type='trex', + exp_name='cartpole_trex_dqn_seed0', min_snippet_length=5, max_snippet_length=100, checkpoint_min=0, @@ -61,6 +62,7 @@ ), env_manager=dict(type='base'), policy=dict(type='dqn'), + reward_model=dict(type='trex'), ) cartpole_trex_dqn_create_config = EasyDict(cartpole_trex_dqn_create_config) create_config = cartpole_trex_dqn_create_config @@ -69,10 +71,12 @@ # Users should first run ``cartpole_dqn_config.py`` to save models (or checkpoints). # Note: Users should check that the checkpoints generated should include iteration_'checkpoint_min'.pth.tar, iteration_'checkpoint_max'.pth.tar with the interval checkpoint_step # where checkpoint_max, checkpoint_min, checkpoint_step are specified above. + # example of running this file: + # python cartpole_trex_dqn_config.py --cfg cartpole_trex_dqn_config.py --seed 0 --device cpu import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_reward_model_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -80,4 +84,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_trex((main_config, create_config)) + serial_pipeline_reward_model_offpolicy((main_config, create_config),pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py index 6cd7a6f227..b4d62b9008 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py @@ -18,7 +18,7 @@ checkpoint_step=100, learning_rate=1e-5, update_per_collect=1, - expert_model_path='abs model path', + expert_model_path='cartpole_ppo_seed0', # expert model experiment directory path hidden_size_list=[512, 64, 1], obs_shape=4, action_shape=2, @@ -68,10 +68,12 @@ # Users should first run ``cartpole_offppo_config.py`` to save models (or checkpoints). # Note: Users should check that the checkpoints generated should include iteration_'checkpoint_min'.pth.tar, iteration_'checkpoint_max'.pth.tar with the interval checkpoint_step # where checkpoint_max, checkpoint_min, checkpoint_step are specified above. + # example: + # python cartpole_trex_offppo_config.py --cfg cartpole_trex_offppo_config.py --seed 0 --device cpu import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_reward_model_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -79,4 +81,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_trex((main_config, create_config)) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py index 8989c902c1..30608a9053 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py @@ -18,7 +18,7 @@ checkpoint_step=100, learning_rate=1e-5, update_per_collect=1, - expert_model_path='abs model path', + expert_model_path='cartpole_ppo_seed0', # expert model experiment directory path hidden_size_list=[512, 64, 1], obs_shape=4, action_shape=2, @@ -69,10 +69,12 @@ # Users should first run ``cartpole_onppo_config.py`` to save models (or checkpoints). # Note: Users should check that the checkpoints generated should include iteration_'checkpoint_min'.pth.tar, iteration_'checkpoint_max'.pth.tar with the interval checkpoint_step # where checkpoint_max, checkpoint_min, checkpoint_step are specified above. + # example of running this file: + # python cartpole_trex_onppo_config.py --cfg cartpole_trex_onppo_config.py --seed 0 --device cpu import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_reward_model_trex_onpolicy + from ding.entry import serial_pipeline_reward_model_onpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -80,4 +82,5 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_trex_onpolicy((main_config, create_config)) + serial_pipeline_reward_model_onpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) + diff --git a/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py b/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py index 7682f60c88..83718e6379 100644 --- a/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py +++ b/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py @@ -26,7 +26,7 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. - expert_model_path='model_path_placeholder', + expert_model_path='halfcheetah_onppo_seed0', hidden_size_list=[512, 64, 1], obs_shape=17, action_shape=6, @@ -88,7 +88,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex_onpolicy + from ding.entry import serial_pipeline_reward_model_onpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -96,4 +96,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_trex_onpolicy([main_config, create_config]) + serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/mujoco/config/halfcheetah_trex_sac_config.py b/dizoo/mujoco/config/halfcheetah_trex_sac_config.py index 9a31e1c493..fcd2d3f179 100644 --- a/dizoo/mujoco/config/halfcheetah_trex_sac_config.py +++ b/dizoo/mujoco/config/halfcheetah_trex_sac_config.py @@ -25,7 +25,7 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. 
# However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. - expert_model_path='model_path_placeholder', + expert_model_path='halfcheetah_sac_seed0', hidden_size_list=[512, 64, 1], obs_shape=17, action_shape=6, @@ -90,7 +90,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -98,4 +98,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_trex([main_config, create_config]) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/mujoco/config/hopper_trex_onppo_config.py b/dizoo/mujoco/config/hopper_trex_onppo_config.py index c14d29d9fd..c905374dbf 100644 --- a/dizoo/mujoco/config/hopper_trex_onppo_config.py +++ b/dizoo/mujoco/config/hopper_trex_onppo_config.py @@ -26,7 +26,7 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config.
- expert_model_path='model_path_placeholder', + expert_model_path='hopper_onppo_seed0', hidden_size_list=[512, 64, 1], obs_shape=11, action_shape=3, @@ -69,6 +69,7 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ppo', ), + reward_model=dict(type='trex', ), ) hopper_trex_onppo_create_config = EasyDict(hopper_trex_onppo_create_config) create_config = hopper_trex_onppo_create_config @@ -80,7 +81,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex_onpolicy + from ding.entry import serial_pipeline_reward_model_onpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -88,4 +89,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_trex_onpolicy([main_config, create_config]) + serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/mujoco/config/hopper_trex_sac_config.py b/dizoo/mujoco/config/hopper_trex_sac_config.py index e99d0636d8..b582773f86 100644 --- a/dizoo/mujoco/config/hopper_trex_sac_config.py +++ b/dizoo/mujoco/config/hopper_trex_sac_config.py @@ -25,7 +25,7 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. 
- expert_model_path='model_path_placeholder', + expert_model_path='hopper_sac_seed0', hidden_size_list=[512, 64, 1], obs_shape=11, action_shape=3, @@ -78,6 +78,7 @@ import_names=['ding.policy.sac'], ), replay_buffer=dict(type='naive', ), + reward_model=dict(type='trex', ), ) hopper_trex_sac_create_config = EasyDict(hopper_trex_sac_create_config) create_config = hopper_trex_sac_create_config @@ -89,7 +90,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -97,4 +98,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_trex([main_config, create_config]) + serial_pipeline_reward_model_offpolicy([main_config, create_config],pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/mujoco/config/walker2d_trex_onppo_config.py b/dizoo/mujoco/config/walker2d_trex_onppo_config.py index c53c1efb4b..fea2611956 100644 --- a/dizoo/mujoco/config/walker2d_trex_onppo_config.py +++ b/dizoo/mujoco/config/walker2d_trex_onppo_config.py @@ -12,6 +12,8 @@ stop_value=3000, ), reward_model=dict( + type='trex', + exp_name='walker2d_trex_onppo_seed0', min_snippet_length=30, max_snippet_length=100, checkpoint_min=10000, @@ -24,14 +26,10 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. - expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /Walker2d.params', - # Users should add their own data path here. 
Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. - # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + expert_model_path='walker2d_onppo_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=17, + action_shape=6, ), policy=dict( cuda=True, @@ -82,7 +80,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex_onpolicy + from ding.entry import serial_pipeline_reward_model_onpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -90,4 +88,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_trex_onpolicy([main_config, create_config]) + serial_pipeline_reward_model_onpolicy([main_config, create_config],pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/mujoco/config/walker2d_trex_sac_config.py b/dizoo/mujoco/config/walker2d_trex_sac_config.py index fdd1cab65e..6bad2bc6a9 100644 --- a/dizoo/mujoco/config/walker2d_trex_sac_config.py +++ b/dizoo/mujoco/config/walker2d_trex_sac_config.py @@ -12,6 +12,8 @@ stop_value=6000, ), reward_model=dict( + type='trex', + exp_name='walker2d_trex_sac_seed0', learning_rate=1e-5, min_snippet_length=30, max_snippet_length=100, @@ -23,14 +25,10 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. - expert_model_path='model_path_placeholder', - # Path where to store the reward model - reward_model_path='data_path_placeholder + /Walker2d.params', - # Users should add their own data path here. 
Data path should lead to a file to store data or load the stored data. - # Absolute path is recommended. - # In DI-engine, it is usually located in ``exp_name`` directory - # See ding/entry/application_entry_trex_collect_data.py to collect the data - data_path='data_path_placeholder', + expert_model_path='walker2d_sac_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=17, + action_shape=6, ), policy=dict( cuda=True, @@ -91,7 +89,7 @@ import argparse import torch from ding.entry import trex_collecting_data - from ding.entry import serial_pipeline_trex + from ding.entry import serial_pipeline_reward_model_offpolicy parser = argparse.ArgumentParser() parser.add_argument('--cfg', type=str, default='please enter abs path for this file') parser.add_argument('--seed', type=int, default=0) @@ -99,4 +97,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_trex([main_config, create_config]) + serial_pipeline_reward_model_offpolicy([main_config, create_config],pretrain_reward=True, cooptrain_reward=False) From 594d61926517bd422cf32595f24fed6b04f7f2eb Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Wed, 3 May 2023 01:53:17 -0400 Subject: [PATCH 24/52] add gail to new reward entry --- ding/entry/serial_entry_gail.py | 170 ------------------ .../serial/pong/pong_gail_dqn_config.py | 22 ++- .../config/bipedalwalker_gail_sac_config.py | 17 +- .../config/lunarlander_gail_dqn_config.py | 22 ++- .../config/cartpole_dqn_gail_config.py | 26 ++- .../config/cartpole_ngu_config copy.py | 126 ------------- dizoo/mujoco/config/ant_gail_sac_config.py | 19 +- .../config/halfcheetah_gail_sac_config.py | 19 +- .../mujoco/config/walker2d_gail_sac_config.py | 19 +- 9 files changed, 90 insertions(+), 350 deletions(-) delete mode 100644 ding/entry/serial_entry_gail.py delete mode 100644 dizoo/classic_control/cartpole/config/cartpole_ngu_config copy.py diff --git 
a/ding/entry/serial_entry_gail.py b/ding/entry/serial_entry_gail.py deleted file mode 100644 index 4060291fac..0000000000 --- a/ding/entry/serial_entry_gail.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional, Tuple -import os -import torch -from ditk import logging -from functools import partial -from tensorboardX import SummaryWriter -from copy import deepcopy -import numpy as np - -from ding.envs import get_vec_env_setting, create_env_manager -from ding.worker import BaseLearner, InteractionSerialEvaluator, BaseSerialCommander, create_buffer, \ - create_serial_collector -from ding.config import read_config, compile_config -from ding.policy import create_policy -from ding.reward_model import create_reward_model -from ding.utils import set_pkg_seed -from ding.entry import collect_demo_data -from ding.utils import save_file -from .utils import random_collect - - -def save_reward_model(path, reward_model, weights_name='best'): - path = os.path.join(path, 'reward_model', 'ckpt') - if not os.path.exists(path): - try: - os.makedirs(path) - except FileExistsError: - pass - path = os.path.join(path, 'ckpt_{}.pth.tar'.format(weights_name)) - state_dict = reward_model.state_dict() - save_file(path, state_dict) - print('Saved reward model ckpt in {}'.format(path)) - - -def serial_pipeline_gail( - input_cfg: Tuple[dict, dict], - expert_cfg: Tuple[dict, dict], - seed: int = 0, - model: Optional[torch.nn.Module] = None, - max_train_iter: Optional[int] = int(1e10), - max_env_step: Optional[int] = int(1e10), - collect_data: bool = True, -) -> 'Policy': # noqa - """ - Overview: - Serial pipeline entry for GAIL reward model. - Arguments: - - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): Config in dict type. \ - ``str`` type means config file path. \ - ``Tuple[dict, dict]`` type means [user_config, create_cfg]. - - expert_cfg (:obj:`Union[str, Tuple[dict, dict]]`): Expert config in dict type. \ - ``str`` type means config file path. 
\ - ``Tuple[dict, dict]`` type means [user_config, create_cfg]. - - seed (:obj:`int`): Random seed. - - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module. - - max_train_iter (:obj:`Optional[int]`): Maximum policy update iterations in training. - - max_env_step (:obj:`Optional[int]`): Maximum collected environment interaction steps. - - collect_data (:obj:`bool`): Collect expert data. - Returns: - - policy (:obj:`Policy`): Converged policy. - """ - if isinstance(input_cfg, str): - cfg, create_cfg = read_config(input_cfg) - else: - cfg, create_cfg = deepcopy(input_cfg) - if isinstance(expert_cfg, str): - expert_cfg, expert_create_cfg = read_config(expert_cfg) - else: - expert_cfg, expert_create_cfg = expert_cfg - create_cfg.policy.type = create_cfg.policy.type + '_command' - cfg = compile_config(cfg, seed=seed, auto=True, create_cfg=create_cfg, save_cfg=True) - if 'data_path' not in cfg.reward_model: - cfg.reward_model.data_path = cfg.exp_name - # Load expert data - if collect_data: - if expert_cfg.policy.get('other', None) is not None and expert_cfg.policy.other.get('eps', None) is not None: - expert_cfg.policy.other.eps.collect = -1 - if expert_cfg.policy.get('load_path', None) is None: - expert_cfg.policy.load_path = cfg.reward_model.expert_model_path - collect_demo_data( - (expert_cfg, expert_create_cfg), - seed, - state_dict_path=expert_cfg.policy.load_path, - expert_data_path=cfg.reward_model.data_path + '/expert_data.pkl', - collect_count=cfg.reward_model.collect_count - ) - # Create main components: env, policy - env_fn, collector_env_cfg, evaluator_env_cfg = get_vec_env_setting(cfg.env) - collector_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in collector_env_cfg]) - evaluator_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in evaluator_env_cfg]) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed, dynamic_seed=False) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - policy = 
create_policy(cfg.policy, model=model, enable_field=['learn', 'collect', 'eval', 'command']) - - # Create worker components: learner, collector, evaluator, replay buffer, commander. - tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial')) - learner = BaseLearner(cfg.policy.learn.learner, policy.learn_mode, tb_logger, exp_name=cfg.exp_name) - collector = create_serial_collector( - cfg.policy.collect.collector, - env=collector_env, - policy=policy.collect_mode, - tb_logger=tb_logger, - exp_name=cfg.exp_name - ) - evaluator = InteractionSerialEvaluator( - cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode, tb_logger, exp_name=cfg.exp_name - ) - replay_buffer = create_buffer(cfg.policy.other.replay_buffer, tb_logger=tb_logger, exp_name=cfg.exp_name) - commander = BaseSerialCommander( - cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode - ) - reward_model = create_reward_model(cfg.reward_model, policy.collect_mode.get_attribute('device'), tb_logger) - - # ========== - # Main loop - # ========== - # Learner's before_run hook. - learner.call_hook('before_run') - - # Accumulate plenty of data at the beginning of training. 
- if cfg.policy.get('random_collect_size', 0) > 0: - random_collect(cfg.policy, policy, collector, collector_env, commander, replay_buffer) - best_reward = -np.inf - while True: - collect_kwargs = commander.step() - # Evaluate policy performance - if evaluator.should_eval(learner.train_iter): - stop, reward = evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep) - reward_mean = np.array([r['eval_episode_return'] for r in reward]).mean() - if reward_mean >= best_reward: - save_reward_model(cfg.exp_name, reward_model, 'best') - best_reward = reward_mean - if stop: - break - new_data_count, target_new_data_count = 0, cfg.reward_model.get('target_new_data_count', 1) - while new_data_count < target_new_data_count: - new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs) - new_data_count += len(new_data) - # collect data for reward_model training - reward_model.collect_data(new_data) - replay_buffer.push(new_data, cur_collector_envstep=collector.envstep) - # update reward_model - reward_model.train() - reward_model.clear_data() - # Learn policy from collected data - for i in range(cfg.policy.learn.update_per_collect): - # Learner will train ``update_per_collect`` times in one iteration. - train_data = replay_buffer.sample(learner.policy.get_attribute('batch_size'), learner.train_iter) - if train_data is None: - # It is possible that replay buffer's data count is too few to train ``update_per_collect`` times - logging.warning( - "Replay buffer's data can only train for {} steps. ".format(i) + - "You can modify data collect config, e.g. increasing n_sample, n_episode." 
- ) - break - # update train_data reward using the augmented reward - train_data_augmented = reward_model.estimate(train_data) - learner.train(train_data_augmented, collector.envstep) - if learner.policy.get_attribute('priority'): - replay_buffer.update(learner.priority_info) - if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter: - break - - # Learner's after_run hook. - learner.call_hook('after_run') - save_reward_model(cfg.exp_name, reward_model, 'last') - # evaluate - # evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep) - return policy diff --git a/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py b/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py index f11a55ecc1..34cb4d9154 100644 --- a/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py +++ b/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py @@ -31,7 +31,7 @@ # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. # Absolute path is recommended. # In DI-engine, it is usually located in ``exp_name`` directory - # e.g. 'exp_name/expert_data.pkl' + # e.g. 'exp_name' data_path='data_path_placeholder', ), policy=dict( @@ -80,13 +80,17 @@ # or you can enter `ding -m serial_gail -c pong_gail_dqn_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. 
pong_dqn_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.atari.config.serial.pong import pong_dqn_config, pong_dqn_create_config - expert_main_config = pong_dqn_config - expert_create_config = pong_dqn_create_config - serial_pipeline_gail( - (main_config, create_config), (expert_main_config, expert_create_config), - max_env_step=1000000, - seed=0, - collect_data=True + + # set your expert config here + expert_cfg = (pong_dqn_config, pong_dqn_create_config) + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, expert_data_path=expert_data_path, collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py b/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py index 5ceefdb02f..f568a04667 100755 --- a/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py +++ b/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py @@ -87,10 +87,17 @@ # or you can enter `ding -m serial_gail -c bipedalwalker_sac_gail_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g.
bipedalwalker_sac_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.box2d.bipedalwalker.config import bipedalwalker_sac_config, bipedalwalker_sac_create_config - expert_main_config = bipedalwalker_sac_config - expert_create_config = bipedalwalker_sac_create_config - serial_pipeline_gail( - [main_config, create_config], [expert_main_config, expert_create_config], seed=0, collect_data=True + + # set your expert config here + expert_cfg = (bipedalwalker_sac_config, bipedalwalker_sac_create_config) + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, expert_data_path=expert_data_path, collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py b/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py index c69506b5ba..3e8742c3dd 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py @@ -29,7 +29,7 @@ # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. # Absolute path is recommended. # In DI-engine, it is usually located in ``exp_name`` directory - # e.g. 'exp_name/expert_data.pkl' + # e.g. 'exp_name' data_path='data_path_placeholder', ), policy=dict( @@ -96,13 +96,17 @@ # or you can enter `ding -m serial_gail -c lunarlander_dqn_gail_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g.
lunarlander_dqn_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.box2d.lunarlander.config import lunarlander_dqn_config, lunarlander_dqn_create_config - expert_main_config = lunarlander_dqn_config - expert_create_config = lunarlander_dqn_create_config - serial_pipeline_gail( - [main_config, create_config], [expert_main_config, expert_create_config], - max_env_step=1000000, - seed=0, - collect_data=True + + # set your expert config here + expert_cfg = (lunarlander_dqn_config, lunarlander_dqn_create_config) + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, expert_data_path=expert_data_path, collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/classic_control/cartpole/config/cartpole_dqn_gail_config.py b/dizoo/classic_control/cartpole/config/cartpole_dqn_gail_config.py index df5cf18ad9..4300f417c8 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_dqn_gail_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_dqn_gail_config.py @@ -20,7 +20,10 @@ # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``.
# If collect_data is True, we will use this expert_model_path to collect expert data first, rather than we # will load data directly from user-defined data_path - expert_model_path='model_path_placeholder', + # data_path is the path to store expert policy data, which is used to train reward model + # so in general, data_path is the same as expert exp name + expert_model_path='cartpole_dqn_seed0/ckpt/ckpt_best.pth.tar', + data_path='cartpole_dqn_seed0', collect_count=1000, ), policy=dict( @@ -68,13 +71,22 @@ # or you can enter `ding -m serial_gail -c cartpole_dqn_gail_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. cartpole_dqn_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.classic_control.cartpole.config import cartpole_dqn_config, cartpole_dqn_create_config + + # set expert config from policy config in dizoo + expert_cfg = (cartpole_dqn_config, cartpole_dqn_create_config) expert_main_config = cartpole_dqn_config - expert_create_config = cartpole_dqn_create_config - serial_pipeline_gail( - (main_config, create_config), (expert_main_config, expert_create_config), - max_env_step=1000000, + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, - collect_data=True + state_dict_path=main_config.reward_model.expert_model_path, + expert_data_path=expert_data_path, + collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/classic_control/cartpole/config/cartpole_ngu_config copy.py b/dizoo/classic_control/cartpole/config/cartpole_ngu_config copy.py deleted file mode 100644 index 1eca4cd0bc..0000000000 --- a/dizoo/classic_control/cartpole/config/cartpole_ngu_config copy.py +++ /dev/null @@ -1,126 +0,0 @@ 
-from easydict import EasyDict - -collector_env_num = 8 -evaluator_env_num = 8 -nstep = 5 -cartpole_ppo_rnd_config = dict( - exp_name='cartpole_ngu_seed0', - env=dict( - collector_env_num=collector_env_num, - evaluator_env_num=evaluator_env_num, - n_evaluator_episode=evaluator_env_num, - #'ALE/Pong-v5' is available. But special setting is needed after gym make. - obs_plus_prev_action_reward=True, # use specific env wrapper for ngu policy - stop_value=195, - ), - rnd_reward_model=dict( - intrinsic_reward_type='add', - learning_rate=1e-4, - obs_shape=4, - action_shape=2, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. - last_nonzero_reward_rescale=False, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. 
- last_nonzero_reward_weight=1, - intrinsic_reward_type='add', - learning_rate=1e-4, - obs_shape=4, - action_shape=2, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='episodic', - ), - policy=dict( - cuda=True, - on_policy=False, - priority=True, - priority_IS_weight=True, - discount_factor=0.997, - nstep=nstep, - burnin_step=2, - # (int) is the total length of [sequence sample] minus - # the length of burnin part in [sequence sample], - # i.e., = = + - learn_unroll_len=40, # set this key according to the episode length - model=dict( - obs_shape=4, - action_shape=2, - encoder_hidden_size_list=[128, 128, 512], - collector_env_num=collector_env_num, - ), - learn=dict( - update_per_collect=8, - batch_size=64, - learning_rate=0.0005, - target_update_theta=0.001, - ), - collect=dict( - # NOTE: It is important that set key traj_len_inf=True here, - # to make sure self._traj_len=INF in serial_sample_collector.py. - # In sequence-based policy, for each collect_env, - # we want to collect data of length self._traj_len=INF - # unless the episode enters the 'done' state. - # In each collect phase, we collect a total of sequence samples. 
- n_sample=32, - traj_len_inf=True, - env_num=collector_env_num, - ), - eval=dict(env_num=evaluator_env_num, ), - other=dict( - eps=dict( - type='exp', - start=0.95, - end=0.05, - decay=1e5, - ), - replay_buffer=dict( - replay_buffer_size=int(2e4), - # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization - alpha=0.6, - # (Float type) How much correction is used: 0 means no correction while 1 means full correction - beta=0.4, - ) - ), - ), -) -cartpole_ppo_rnd_config = EasyDict(cartpole_ppo_rnd_config) -main_config = cartpole_ppo_rnd_config -cartpole_ppo_rnd_create_config = dict( - env=dict( - type='cartpole', - import_names=['dizoo.classic_control.cartpole.envs.cartpole_env'], - ), - env_manager=dict(type='base'), - policy=dict(type='ngu'), - rnd_reward_model=dict(type='rnd-ngu'), - episodic_reward_model=dict(type='episodic'), -) -cartpole_ppo_rnd_create_config = EasyDict(cartpole_ppo_rnd_create_config) -create_config = cartpole_ppo_rnd_create_config - -if __name__ == "__main__": - # or you can enter `ding -m serial_ngu -c pong_ngu_config.py -s 0` - from ding.entry import serial_pipeline_ngu - serial_pipeline_ngu([main_config, create_config], seed=0) diff --git a/dizoo/mujoco/config/ant_gail_sac_config.py b/dizoo/mujoco/config/ant_gail_sac_config.py index 53fd375318..7072f38142 100644 --- a/dizoo/mujoco/config/ant_gail_sac_config.py +++ b/dizoo/mujoco/config/ant_gail_sac_config.py @@ -88,14 +88,17 @@ # or you can enter `ding -m serial_gail -c ant_gail_sac_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. 
hopper_sac_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.mujoco.config.ant_sac_config import ant_sac_config, ant_sac_create_config - expert_main_config = ant_sac_config - expert_create_config = ant_sac_create_config - serial_pipeline_gail( - [main_config, create_config], [expert_main_config, expert_create_config], - max_env_step=10000000, - seed=0, - collect_data=True + # set your expert config here + expert_cfg = (ant_sac_config, ant_sac_create_config) + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, expert_data_path=expert_data_path, collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy(main_config, create_config) diff --git a/dizoo/mujoco/config/halfcheetah_gail_sac_config.py b/dizoo/mujoco/config/halfcheetah_gail_sac_config.py index 2b0ee0a5a1..be0d6d2ef8 100644 --- a/dizoo/mujoco/config/halfcheetah_gail_sac_config.py +++ b/dizoo/mujoco/config/halfcheetah_gail_sac_config.py @@ -87,14 +87,17 @@ # or you can enter `ding -m serial_gail -c ant_gail_sac_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. 
hopper_sac_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.mujoco.config.halfcheetah_sac_config import halfcheetah_sac_config, halfcheetah_sac_create_config - expert_main_config = halfcheetah_sac_config - expert_create_config = halfcheetah_sac_create_config - serial_pipeline_gail( - [main_config, create_config], [expert_main_config, expert_create_config], - max_env_step=10000000, - seed=0, - collect_data=True + # set your expert config here + expert_cfg = (halfcheetah_sac_config, halfcheetah_sac_create_config) + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, expert_data_path=expert_data_path, collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy(main_config, create_config) diff --git a/dizoo/mujoco/config/walker2d_gail_sac_config.py b/dizoo/mujoco/config/walker2d_gail_sac_config.py index 7bd2de9022..a4e053c8cc 100644 --- a/dizoo/mujoco/config/walker2d_gail_sac_config.py +++ b/dizoo/mujoco/config/walker2d_gail_sac_config.py @@ -88,14 +88,17 @@ # or you can enter `ding -m serial_gail -c ant_gail_sac_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. 
hopper_sac_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.mujoco.config.walker2d_sac_config import walker2d_sac_config, walker2d_sac_create_config - expert_main_config = walker2d_sac_config - expert_create_config = walker2d_sac_create_config - serial_pipeline_gail( - [main_config, create_config], [expert_main_config, expert_create_config], - max_env_step=5000000, - seed=0, - collect_data=True + # set your expert config here + expert_cfg = (walker2d_sac_config, walker2d_sac_create_config) + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, expert_data_path=expert_data_path, collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy(main_config, create_config) From 58a2bff19f05ca9482eeff5c645405544f87a86f Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Wed, 3 May 2023 02:00:51 -0400 Subject: [PATCH 25/52] remove preferenced based irl entry(used for trex, drex before) --- ding/entry/__init__.py | 5 - .../serial_entry_preference_based_irl.py | 133 ------------------ ...ial_entry_preference_based_irl_onpolicy.py | 104 -------------- 3 files changed, 242 deletions(-) delete mode 100644 ding/entry/serial_entry_preference_based_irl.py delete mode 100644 ding/entry/serial_entry_preference_based_irl_onpolicy.py diff --git a/ding/entry/__init__.py b/ding/entry/__init__.py index e0501b12db..e04299ed96 100644 --- a/ding/entry/__init__.py +++ b/ding/entry/__init__.py @@ -18,12 +18,7 @@ episode_to_transitions, episode_to_transitions_filter from .application_entry_trex_collect_data import trex_collecting_data, collect_episodic_demo_data_for_trex from .serial_entry_guided_cost import serial_pipeline_guided_cost -from .serial_entry_gail import serial_pipeline_gail from .utils import random_collect -from 
.serial_entry_preference_based_irl \ - import serial_pipeline_preference_based_irl -from .serial_entry_preference_based_irl_onpolicy \ - import serial_pipeline_preference_based_irl_onpolicy from .application_entry_drex_collect_data import drex_collecting_data from .serial_entry_mbrl import serial_pipeline_dyna, serial_pipeline_dream from .serial_entry_bco import serial_pipeline_bco diff --git a/ding/entry/serial_entry_preference_based_irl.py b/ding/entry/serial_entry_preference_based_irl.py deleted file mode 100644 index 682e662baa..0000000000 --- a/ding/entry/serial_entry_preference_based_irl.py +++ /dev/null @@ -1,133 +0,0 @@ -import copy -from typing import Union, Optional, List, Any, Tuple -import os -import torch -from ditk import logging -from functools import partial -from tensorboardX import SummaryWriter -from copy import deepcopy - -from ding.envs import get_vec_env_setting, create_env_manager -from ding.worker import BaseLearner, InteractionSerialEvaluator, BaseSerialCommander, create_buffer, \ - create_serial_collector -from ding.config import read_config, compile_config -from ding.policy import create_policy, PolicyFactory -from ding.reward_model import create_reward_model -from ding.utils import set_pkg_seed - - -def serial_pipeline_preference_based_irl( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - env_setting: Optional[List[Any]] = None, - model: Optional[torch.nn.Module] = None, - max_train_iter: Optional[int] = int(1e10), - max_env_step: Optional[int] = int(1e10), -) -> 'Policy': # noqa - """ - Overview: - serial_pipeline_preference_based_irl. - Arguments: - - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): Config in dict type. \ - ``str`` type means config file path. \ - ``Tuple[dict, dict]`` type means [user_config, create_cfg]. - - seed (:obj:`int`): Random seed. - - env_setting (:obj:`Optional[List[Any]]`): A list with 3 elements: \ - ``BaseEnv`` subclass, collector env config, and evaluator env config. 
- - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module. - - max_iterations (:obj:`Optional[torch.nn.Module]`): Learner's max iteration. Pipeline will stop \ - when reaching this iteration. - Returns: - - policy (:obj:`Policy`): Converged policy. - """ - if isinstance(input_cfg, str): - cfg, create_cfg = read_config(input_cfg) - else: - cfg, create_cfg = deepcopy(input_cfg) - create_cfg.policy.type = create_cfg.policy.type + '_command' - create_cfg.reward_model = dict(type=cfg.reward_model.type) - env_fn = None if env_setting is None else env_setting[0] - cfg = compile_config(cfg, seed=seed, env=env_fn, auto=True, create_cfg=create_cfg, save_cfg=True, renew_dir=False) - cfg_bak = copy.deepcopy(cfg) - # Create main components: env, policy - if env_setting is None: - env_fn, collector_env_cfg, evaluator_env_cfg = get_vec_env_setting(cfg.env) - else: - env_fn, collector_env_cfg, evaluator_env_cfg = env_setting - collector_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in collector_env_cfg]) - evaluator_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in evaluator_env_cfg]) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed, dynamic_seed=False) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - policy = create_policy(cfg.policy, model=model, enable_field=['learn', 'collect', 'eval', 'command']) - - # Create worker components: learner, collector, evaluator, replay buffer, commander. 
- tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial')) - learner = BaseLearner(cfg.policy.learn.learner, policy.learn_mode, tb_logger, exp_name=cfg.exp_name) - collector = create_serial_collector( - cfg.policy.collect.collector, - env=collector_env, - policy=policy.collect_mode, - tb_logger=tb_logger, - exp_name=cfg.exp_name - ) - evaluator = InteractionSerialEvaluator( - cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode, tb_logger, exp_name=cfg.exp_name - ) - replay_buffer = create_buffer(cfg.policy.other.replay_buffer, tb_logger=tb_logger, exp_name=cfg.exp_name) - commander = BaseSerialCommander( - cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode - ) - - reward_model = create_reward_model(cfg_bak, policy.collect_mode.get_attribute('device'), tb_logger) - reward_model.train() - # ========== - # Main loop - # ========== - # Learner's before_run hook. - learner.call_hook('before_run') - - # Accumulate plenty of data at the beginning of training. 
- if cfg.policy.get('random_collect_size', 0) > 0: - if cfg.policy.get('transition_with_policy_data', False): - collector.reset_policy(policy.collect_mode) - else: - action_space = collector_env.env_info().act_space - random_policy = PolicyFactory.get_random_policy(policy.collect_mode, action_space=action_space) - collector.reset_policy(random_policy) - collect_kwargs = commander.step() - new_data = collector.collect(n_sample=cfg.policy.random_collect_size, policy_kwargs=collect_kwargs) - replay_buffer.push(new_data, cur_collector_envstep=0) - collector.reset_policy(policy.collect_mode) - while True: - collect_kwargs = commander.step() - # Evaluate policy performance - if evaluator.should_eval(learner.train_iter): - stop, reward = evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep) - if stop: - break - # Collect data by default config n_sample/n_episode - new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs) - replay_buffer.push(new_data, cur_collector_envstep=collector.envstep) - # Learn policy from collected data - for i in range(cfg.policy.learn.update_per_collect): - # Learner will train ``update_per_collect`` times in one iteration. - train_data = replay_buffer.sample(learner.policy.get_attribute('batch_size'), learner.train_iter) - if train_data is None: - # It is possible that replay buffer's data count is too few to train ``update_per_collect`` times - logging.warning( - "Replay buffer's data can only train for {} steps. ".format(i) + - "You can modify data collect config, e.g. increasing n_sample, n_episode." 
- ) - break - # update train_data reward using the augmented reward - train_data_augmented = reward_model.estimate(train_data) - learner.train(train_data_augmented, collector.envstep) - if learner.policy.get_attribute('priority'): - replay_buffer.update(learner.priority_info) - if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter: - break - - # Learner's after_run hook. - learner.call_hook('after_run') - return policy diff --git a/ding/entry/serial_entry_preference_based_irl_onpolicy.py b/ding/entry/serial_entry_preference_based_irl_onpolicy.py deleted file mode 100644 index 3941f3337e..0000000000 --- a/ding/entry/serial_entry_preference_based_irl_onpolicy.py +++ /dev/null @@ -1,104 +0,0 @@ -from typing import Union, Optional, List, Any, Tuple -import os -import torch -from ditk import logging -from functools import partial -from tensorboardX import SummaryWriter -from copy import deepcopy - -from ding.envs import get_vec_env_setting, create_env_manager -from ding.worker import BaseLearner, InteractionSerialEvaluator, BaseSerialCommander, create_buffer, \ - create_serial_collector -from ding.config import read_config, compile_config -from ding.policy import create_policy, PolicyFactory -from ding.reward_model import create_reward_model -from ding.utils import set_pkg_seed - - -def serial_pipeline_preference_based_irl_onpolicy( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - env_setting: Optional[List[Any]] = None, - model: Optional[torch.nn.Module] = None, - max_train_iter: Optional[int] = int(1e10), - max_env_step: Optional[int] = int(1e10), -) -> 'Policy': # noqa - """ - Overview: - Serial pipeline entry for preference based irl of on-policy algorithm(such as PPO). - Arguments: - - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): Config in dict type. \ - ``str`` type means config file path. \ - ``Tuple[dict, dict]`` type means [user_config, create_cfg]. - - seed (:obj:`int`): Random seed. 
- - env_setting (:obj:`Optional[List[Any]]`): A list with 3 elements: \ - ``BaseEnv`` subclass, collector env config, and evaluator env config. - - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module. - - max_train_iter (:obj:`Optional[int]`): Maximum policy update iterations in training. - - max_env_step (:obj:`Optional[int]`): Maximum collected environment interaction steps. - Returns: - - policy (:obj:`Policy`): Converged policy. - """ - if isinstance(input_cfg, str): - cfg, create_cfg = read_config(input_cfg) - else: - cfg, create_cfg = deepcopy(input_cfg) - create_cfg.policy.type = create_cfg.policy.type + '_command' - create_cfg.reward_model = dict(type=cfg.reward_model.type) - env_fn = None if env_setting is None else env_setting[0] - cfg = compile_config(cfg, seed=seed, env=env_fn, auto=True, create_cfg=create_cfg, save_cfg=True, renew_dir=False) - # Create main components: env, policy - if env_setting is None: - env_fn, collector_env_cfg, evaluator_env_cfg = get_vec_env_setting(cfg.env) - else: - env_fn, collector_env_cfg, evaluator_env_cfg = env_setting - collector_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in collector_env_cfg]) - evaluator_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in evaluator_env_cfg]) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed, dynamic_seed=False) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - policy = create_policy(cfg.policy, model=model, enable_field=['learn', 'collect', 'eval', 'command']) - - # Create worker components: learner, collector, evaluator, replay buffer, commander. 
- tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial')) - learner = BaseLearner(cfg.policy.learn.learner, policy.learn_mode, tb_logger, exp_name=cfg.exp_name) - collector = create_serial_collector( - cfg.policy.collect.collector, - env=collector_env, - policy=policy.collect_mode, - tb_logger=tb_logger, - exp_name=cfg.exp_name - ) - evaluator = InteractionSerialEvaluator( - cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode, tb_logger, exp_name=cfg.exp_name - ) - commander = BaseSerialCommander( - cfg.policy.other.commander, learner, collector, evaluator, None, policy.command_mode - ) - reward_model = create_reward_model(cfg, policy.collect_mode.get_attribute('device'), tb_logger) - reward_model.train() - # ========== - # Main loop - # ========== - # Learner's before_run hook. - learner.call_hook('before_run') - - while True: - collect_kwargs = commander.step() - # Evaluate policy performance - if evaluator.should_eval(learner.train_iter): - stop, reward = evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep) - if stop: - break - # Collect data by default config n_sample/n_episode - new_data = collector.collect(train_iter=learner.train_iter) - train_data = new_data - # update train_data reward using the augmented reward - train_data_augmented = reward_model.estimate(train_data) - learner.train(train_data_augmented, collector.envstep) - if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter: - break - - # Learner's after_run hook. 
- learner.call_hook('after_run') - return policy From 822d7a45bacb9c4711cdf45b011bff6cd165f973 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Wed, 3 May 2023 02:28:25 -0400 Subject: [PATCH 26/52] remove unuse code in gcl --- ding/reward_model/guided_cost_reward_model.py | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/ding/reward_model/guided_cost_reward_model.py b/ding/reward_model/guided_cost_reward_model.py index 9ce49f3157..bcc134de8d 100644 --- a/ding/reward_model/guided_cost_reward_model.py +++ b/ding/reward_model/guided_cost_reward_model.py @@ -13,27 +13,6 @@ from .network import GCLNetwork -class GuidedCostNN(nn.Module): - - def __init__( - self, - input_size, - hidden_size=128, - output_size=1, - ): - super(GuidedCostNN, self).__init__() - self.net = nn.Sequential( - nn.Linear(input_size, hidden_size), - nn.ReLU(), - nn.Linear(hidden_size, hidden_size), - nn.ReLU(), - nn.Linear(hidden_size, output_size), - ) - - def forward(self, x): - return self.net(x) - - @REWARD_MODEL_REGISTRY.register('guided_cost') class GuidedCostRewardModel(BaseRewardModel): """ From be03aa99bfe9860c08defd77d84678d67491d347 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Thu, 4 May 2023 02:44:38 -0400 Subject: [PATCH 27/52] change clear data from pipeline to RM && add ngu to new entry --- .../serial_entry_reward_model_offpolicy.py | 5 +- .../serial_entry_reward_model_onpolicy.py | 3 +- .../tests/test_serial_entry_reward_model.py | 10 +++ ding/policy/ngu.py | 2 +- ding/reward_model/base_reward_model.py | 2 +- ding/reward_model/gail_irl_model.py | 8 +- ding/reward_model/guided_cost_reward_model.py | 2 +- ding/reward_model/icm_reward_model.py | 14 +-- ding/reward_model/ngu_reward_model.py | 90 +++++++++++++++++++ ding/reward_model/pdeil_irl_model.py | 8 +- ding/reward_model/pwil_irl_model.py | 8 +- ding/reward_model/red_irl_model.py | 2 +- ding/reward_model/rnd_reward_model.py | 8 +- ding/reward_model/trex_reward_model.py | 7 +- 
.../serial/montezuma/montezuma_ngu_config.py | 88 +++++++++--------- .../serial/pitfall/pitfall_ngu_config.py | 88 +++++++++--------- .../config/serial/pong/pong_ngu_config.py | 88 +++++++++--------- .../config/lunarlander_ngu_config.py | 88 +++++++++--------- .../cartpole/config/cartpole_ngu_config.py | 88 +++++++++--------- dizoo/minigrid/config/minigrid_ngu_config.py | 88 +++++++++--------- 20 files changed, 420 insertions(+), 277 deletions(-) diff --git a/ding/entry/serial_entry_reward_model_offpolicy.py b/ding/entry/serial_entry_reward_model_offpolicy.py index d3f6c59390..c9cbc40259 100644 --- a/ding/entry/serial_entry_reward_model_offpolicy.py +++ b/ding/entry/serial_entry_reward_model_offpolicy.py @@ -115,9 +115,8 @@ def serial_pipeline_reward_model_offpolicy( # update reward_model, when you want to train reward_model inloop if cooptrain_reward: reward_model.train() - # clear buffer per fix iters to make sure replay buffer's data count isn't too few. - if hasattr(cfg.reward_model, 'clear_buffer_per_iters') and count % cfg.reward_model.clear_buffer_per_iters == 0: - reward_model.clear_data() + # clear buffer per fix iters to make sure replay buffer's data count isn't too few. + reward_model.clear_data(iter=count) # Learn policy from collected data for i in range(cfg.policy.learn.update_per_collect): # Learner will train ``update_per_collect`` times in one iteration. 
diff --git a/ding/entry/serial_entry_reward_model_onpolicy.py b/ding/entry/serial_entry_reward_model_onpolicy.py index f67e230db4..975b32f461 100644 --- a/ding/entry/serial_entry_reward_model_onpolicy.py +++ b/ding/entry/serial_entry_reward_model_onpolicy.py @@ -114,8 +114,7 @@ def serial_pipeline_reward_model_onpolicy( # update reward_model if cooptrain_reward: reward_model.train() - if hasattr(cfg.reward_model, 'clear_buffer_per_iters') and count % cfg.reward_model.clear_buffer_per_iters == 0: - reward_model.clear_data() + reward_model.clear_data(iter=count) # Learn policy from collected data for i in range(cfg.policy.learn.update_per_collect): # Learner will train ``update_per_collect`` times in one iteration. diff --git a/ding/entry/tests/test_serial_entry_reward_model.py b/ding/entry/tests/test_serial_entry_reward_model.py index 89c1bf49ff..1978e7ae67 100644 --- a/ding/entry/tests/test_serial_entry_reward_model.py +++ b/ding/entry/tests/test_serial_entry_reward_model.py @@ -10,6 +10,7 @@ from dizoo.classic_control.cartpole.config.cartpole_ppo_offpolicy_config import cartpole_ppo_offpolicy_config, cartpole_ppo_offpolicy_create_config # noqa from dizoo.classic_control.cartpole.config.cartpole_rnd_onppo_config import cartpole_ppo_rnd_config, cartpole_ppo_rnd_create_config # noqa from dizoo.classic_control.cartpole.config.cartpole_ppo_icm_config import cartpole_ppo_icm_config, cartpole_ppo_icm_create_config # noqa +from dizoo.classic_control.cartpole.config.cartpole_ngu_config import cartpole_ngu_config, cartpole_ngu_create_config from ding.entry import serial_pipeline, collect_demo_data, serial_pipeline_reward_model_offpolicy, \ serial_pipeline_reward_model_onpolicy from ding.entry.application_entry_trex_collect_data import trex_collecting_data @@ -116,3 +117,12 @@ def test_icm(): serial_pipeline_reward_model_offpolicy(config, seed=0, max_train_iter=2) except Exception: assert False, "pipeline fail" + + +@pytest.mark.unittest +def test_ngu(): + config = 
[deepcopy(cartpole_ngu_config), deepcopy(cartpole_ngu_create_config)] + try: + serial_pipeline_reward_model_offpolicy(config, seed=0, max_train_iter=2) + except Exception: + assert False, "pipeline fail" diff --git a/ding/policy/ngu.py b/ding/policy/ngu.py index 95fe2dd82a..efe2ee5b80 100644 --- a/ding/policy/ngu.py +++ b/ding/policy/ngu.py @@ -431,7 +431,7 @@ def _init_collect(self) -> None: # epsilon=0.4, alpha=9 self.eps = {i: 0.4 ** (1 + 8 * i / (self._cfg.collect.env_num - 1)) for i in range(self._cfg.collect.env_num)} - def _forward_collect(self, data: dict) -> dict: + def _forward_collect(self, data: dict, eps: Optional[float]) -> dict: r""" Overview: Collect output according to eps_greedy plugin diff --git a/ding/reward_model/base_reward_model.py b/ding/reward_model/base_reward_model.py index 963bacf1d7..930975e8da 100644 --- a/ding/reward_model/base_reward_model.py +++ b/ding/reward_model/base_reward_model.py @@ -60,7 +60,7 @@ def collect_data(self, data) -> None: raise NotImplementedError() @abstractmethod - def clear_data(self) -> None: + def clear_data(self, iter: int) -> None: """ Overview: Clearing training data. \ diff --git a/ding/reward_model/gail_irl_model.py b/ding/reward_model/gail_irl_model.py index 2db47be052..b638671ac1 100644 --- a/ding/reward_model/gail_irl_model.py +++ b/ding/reward_model/gail_irl_model.py @@ -212,10 +212,14 @@ def collect_data(self, data: list) -> None: data = torch.unbind(data, dim=0) self.train_data.extend(data) - def clear_data(self) -> None: + def clear_data(self, iter: int) -> None: """ Overview: Clearing training data. 
\ This is a side effect function which clears the data attribute in ``self`` """ - self.train_data.clear() + assert hasattr( + self.cfg, 'clear_buffer_per_iters' + ), "Reward Model does not have clear_buffer_per_iters, Clear failed" + if iter % self.cfg.clear_buffer_per_iters == 0: + self.train_data.clear() diff --git a/ding/reward_model/guided_cost_reward_model.py b/ding/reward_model/guided_cost_reward_model.py index bcc134de8d..8121b30c63 100644 --- a/ding/reward_model/guided_cost_reward_model.py +++ b/ding/reward_model/guided_cost_reward_model.py @@ -132,7 +132,7 @@ def collect_data(self, data) -> None: # if online_net is trained continuously, there should be some implementations in collect_data method pass - def clear_data(self): + def clear_data(self, iter: int): """ Overview: Collecting clearing data, not implemented if reward model (i.e. online_net) is only trained ones, \ diff --git a/ding/reward_model/icm_reward_model.py b/ding/reward_model/icm_reward_model.py index 89d76daa3b..f3a9dbfbba 100644 --- a/ding/reward_model/icm_reward_model.py +++ b/ding/reward_model/icm_reward_model.py @@ -175,11 +175,15 @@ def collect_data(self, data: list) -> None: self.train_next_states.extend(next_states) self.train_actions.extend(actions) - def clear_data(self) -> None: - self.train_data.clear() - self.train_states.clear() - self.train_next_states.clear() - self.train_actions.clear() + def clear_data(self, iter: int) -> None: + assert hasattr( + self.cfg, 'clear_buffer_per_iters' + ), "Reward Model does not have clear_buffer_per_iters, Clear failed" + if iter % self.cfg.clear_buffer_per_iters == 0: + self.train_data.clear() + self.train_states.clear() + self.train_next_states.clear() + self.train_actions.clear() def state_dict(self) -> Dict: return self.reward_model.state_dict() diff --git a/ding/reward_model/ngu_reward_model.py b/ding/reward_model/ngu_reward_model.py index 79320bd69f..34ca9e7426 100644 --- a/ding/reward_model/ngu_reward_model.py +++ 
b/ding/reward_model/ngu_reward_model.py @@ -1,5 +1,6 @@ import copy import random +from typing import Any import numpy as np import torch @@ -408,3 +409,92 @@ def fusion_reward( int(data[i]['beta'][j])] return data, estimate_cnt + + +@REWARD_MODEL_REGISTRY.register('ngu-reward') +class NGURewardModel(BaseRewardModel): + r""" + Overview: + The unifying reward for ngu which combined rnd-ngu and episodic + The corresponding paper is `never give up: learning directed exploration strategies`. + """ + config = dict( + type='ngu-reward', + policy_nstep=5, + collect_env_num=8, + rnd_reward_model=dict( + intrinsic_reward_type='add', + learning_rate=5e-4, + obs_shape=4, + action_shape=2, + batch_size=128, # transitions + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=5, + hidden_size_list=[128, 128, 64], + type='rnd-ngu', + ), + episodic_reward_model=dict( + last_nonzero_reward_rescale=False, + last_nonzero_reward_weight=1, + intrinsic_reward_type='add', + learning_rate=5e-4, + obs_shape=4, + action_shape=2, + batch_size=128, # transitions + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=5, + hidden_size_list=[128, 128, 64], + type='episodic', + ), + ) + + def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> None: + super(NGURewardModel).__init__() + self.cfg = config + self.tb_logger = tb_logger + self.estimate_cnt = 0 + self.rnd_reward_model = RndNGURewardModel(config.rnd_reward_model, device, tb_logger) + self.episodic_reward_model = EpisodicNGURewardModel(config.episodic_reward_model, device, tb_logger) + + def train(self) -> None: + self.rnd_reward_model.train() + self.episodic_reward_model.train() + + def estimate(self, data: list) -> dict: + + # estimate reward + rnd_reward = self.rnd_reward_model.estimate(data) + episodic_reward = self.episodic_reward_model.estimate(data) + + # combine reward + train_data_augumented, 
self.estimate_cnt = self.episodic_reward_model.fusion_reward( + data, + episodic_reward, + rnd_reward, + nstep=self.cfg.policy_nstep, + collector_env_num=self.cfg.collect_env_num, + tb_logger=self.tb_logger, + estimate_cnt=self.estimate_cnt + ) + + return train_data_augumented + + def collect_data(self, data) -> None: + self.rnd_reward_model.collect_data(data) + self.episodic_reward_model.collect_data(data) + + def clear_data(self, iter: int) -> None: + assert hasattr( + self.cfg.rnd_reward_model, 'clear_buffer_per_iters' + ), "RND Reward Model does not have clear_buffer_per_iters, Clear failed" + assert hasattr( + self.cfg.episodic_reward_model, 'clear_buffer_per_iters' + ), "Episodic Reward Model does not have clear_buffer_per_iters, Clear failed" + if iter % self.cfg.rnd_reward_model.clear_buffer_per_iters == 0: + self.rnd_reward_model.clear_data() + if iter % self.cfg.episodic_reward_model.clear_buffer_per_iters == 0: + self.episodic_reward_model.clear_data() diff --git a/ding/reward_model/pdeil_irl_model.py b/ding/reward_model/pdeil_irl_model.py index 6e47068e4d..afe2d8fce9 100644 --- a/ding/reward_model/pdeil_irl_model.py +++ b/ding/reward_model/pdeil_irl_model.py @@ -210,10 +210,14 @@ def collect_data(self, item: list): """ self.train_data.extend(item) - def clear_data(self): + def clear_data(self, iter: int): """ Overview: Clearing training data. 
\ This is a side effect function which clears the data attribute in ``self`` """ - self.train_data.clear() + assert hasattr( + self.cfg, 'clear_buffer_per_iters' + ), "Reward Model does not have clear_buffer_per_iters, Clear failed" + if iter % self.cfg.clear_buffer_per_iters == 0: + self.train_data.clear() diff --git a/ding/reward_model/pwil_irl_model.py b/ding/reward_model/pwil_irl_model.py index ba777f7cb9..465cc16d0f 100644 --- a/ding/reward_model/pwil_irl_model.py +++ b/ding/reward_model/pwil_irl_model.py @@ -255,10 +255,14 @@ def _train(self, data: list): reward = self.cfg.alpha * math.exp(self.reward_factor * c) self.reward_table[(s, a)] = torch.FloatTensor([reward]) - def clear_data(self) -> None: + def clear_data(self, iter: int) -> None: """ Overview: Clearing training data. \ This is a side effect function which clears the data attribute in ``self`` """ - self.train_data.clear() + assert hasattr( + self.cfg, 'clear_buffer_per_iters' + ), "Reward Model does not have clear_buffer_per_iters, Clear failed" + if iter % self.cfg.clear_buffer_per_iters == 0: + self.train_data.clear() diff --git a/ding/reward_model/red_irl_model.py b/ding/reward_model/red_irl_model.py index bcc816af60..d5ff636ba7 100644 --- a/ding/reward_model/red_irl_model.py +++ b/ding/reward_model/red_irl_model.py @@ -168,7 +168,7 @@ def collect_data(self, data) -> None: # if online_net is trained continuously, there should be some implementations in collect_data method pass - def clear_data(self): + def clear_data(self, iter: int): """ Overview: Collecting clearing data, not implemented if reward model (i.e. 
online_net) is only trained ones, \ diff --git a/ding/reward_model/rnd_reward_model.py b/ding/reward_model/rnd_reward_model.py index 4588b42bcf..d7ae37e787 100644 --- a/ding/reward_model/rnd_reward_model.py +++ b/ding/reward_model/rnd_reward_model.py @@ -164,8 +164,12 @@ def estimate(self, data: list) -> List[Dict]: def collect_data(self, data: list) -> None: self.train_obs.extend(collect_states(data)) - def clear_data(self) -> None: - self.train_obs.clear() + def clear_data(self, iter: int) -> None: + assert hasattr( + self.cfg, 'clear_buffer_per_iters' + ), "Reward Model does not have clear_buffer_per_iters, Clear failed" + if iter % self.cfg.clear_buffer_per_iters == 0: + self.train_obs.clear() def state_dict(self) -> Dict: return self.reward_model.state_dict() diff --git a/ding/reward_model/trex_reward_model.py b/ding/reward_model/trex_reward_model.py index 25c700a40a..034b9bbd32 100644 --- a/ding/reward_model/trex_reward_model.py +++ b/ding/reward_model/trex_reward_model.py @@ -327,11 +327,12 @@ def collect_data(self, data: list) -> None: """ pass - def clear_data(self) -> None: + def clear_data(self, iter: int) -> None: """ Overview: Clearing training data. 
\ This is a side effect function which clears the data attribute in ``self`` """ - self.training_obs.clear() - self.training_labels.clear() + if hasattr(self.cfg, 'clear_buffer_per_iters') and iter % self.cfg.clear_buffer_per_iters == 0: + self.training_obs.clear() + self.training_labels.clear() diff --git a/dizoo/atari/config/serial/montezuma/montezuma_ngu_config.py b/dizoo/atari/config/serial/montezuma/montezuma_ngu_config.py index b8dfec3526..64e2173e0e 100644 --- a/dizoo/atari/config/serial/montezuma/montezuma_ngu_config.py +++ b/dizoo/atari/config/serial/montezuma/montezuma_ngu_config.py @@ -15,44 +15,49 @@ stop_value=int(1e5), frame_stack=4, ), - rnd_reward_model=dict( - intrinsic_reward_type='add', - learning_rate=0.001, - obs_shape=[4, 84, 84], - action_shape=18, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. - last_nonzero_reward_rescale=False, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. 
- last_nonzero_reward_weight=1, - intrinsic_reward_type='add', - learning_rate=0.001, - obs_shape=[4, 84, 84], - action_shape=18, - batch_size=320, - update_per_collect=10, # 32*100/64=50 - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='episodic', + reward_model=dict( + type='ngu-reward', + policy_nstep=nstep, + collect_env_num=collector_env_num, + rnd_reward_model=dict( + intrinsic_reward_type='add', + learning_rate=0.001, + obs_shape=[4, 84, 84], + action_shape=18, + batch_size=320, + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='rnd-ngu', + ), + episodic_reward_model=dict( + # means if using rescale trick to the last non-zero reward + # when combing extrinsic and intrinsic reward. + # the rescale trick only used in: + # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal + # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander + # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, + # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the + # original last nonzero extrinsic reward. + # please refer to ngu_reward_model for details. + last_nonzero_reward_rescale=False, + # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True + # please refer to ngu_reward_model for details. 
+ last_nonzero_reward_weight=1, + intrinsic_reward_type='add', + learning_rate=0.001, + obs_shape=[4, 84, 84], + action_shape=18, + batch_size=320, + update_per_collect=10, # 32*100/64=50 + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='episodic', + ), ), policy=dict( cuda=True, @@ -116,12 +121,11 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ngu'), - rnd_reward_model=dict(type='rnd-ngu'), - episodic_reward_model=dict(type='episodic'), + reward_model=dict(type='ngu-reward'), ) montezuma_ppo_rnd_create_config = EasyDict(montezuma_ppo_rnd_create_config) create_config = montezuma_ppo_rnd_create_config if __name__ == "__main__": - from ding.entry import serial_pipeline_reward_model_ngu - serial_pipeline_reward_model_ngu([main_config, create_config], seed=0) + from ding.entry import serial_pipeline_reward_model_offpolicy + serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0) diff --git a/dizoo/atari/config/serial/pitfall/pitfall_ngu_config.py b/dizoo/atari/config/serial/pitfall/pitfall_ngu_config.py index 55f275333a..d0f8ece1d2 100644 --- a/dizoo/atari/config/serial/pitfall/pitfall_ngu_config.py +++ b/dizoo/atari/config/serial/pitfall/pitfall_ngu_config.py @@ -22,44 +22,49 @@ stop_value=int(1e5), frame_stack=4, ), - rnd_reward_model=dict( - intrinsic_reward_type='add', # 'assign' - learning_rate=1e-4, - obs_shape=[4, 84, 84], - action_shape=18, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. 
the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. - last_nonzero_reward_rescale=False, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. - last_nonzero_reward_weight=1, - intrinsic_reward_type='add', - learning_rate=1e-4, - obs_shape=[4, 84, 84], - action_shape=18, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='episodic', + reward_model=dict( + type='ngu-reward', + policy_nstep=nstep, + collect_env_num=collector_env_num, + rnd_reward_model=dict( + intrinsic_reward_type='add', # 'assign' + learning_rate=1e-4, + obs_shape=[4, 84, 84], + action_shape=18, + batch_size=320, + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='rnd-ngu', + ), + episodic_reward_model=dict( + # means if using rescale trick to the last non-zero reward + # when combing extrinsic and intrinsic reward. + # the rescale trick only used in: + # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal + # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander + # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, + # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the + # original last nonzero extrinsic reward. + # please refer to ngu_reward_model for details. 
+ last_nonzero_reward_rescale=False, + # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True + # please refer to ngu_reward_model for details. + last_nonzero_reward_weight=1, + intrinsic_reward_type='add', + learning_rate=1e-4, + obs_shape=[4, 84, 84], + action_shape=18, + batch_size=320, + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='episodic', + ), ), policy=dict( cuda=True, @@ -123,12 +128,11 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ngu'), - rnd_reward_model=dict(type='rnd-ngu'), - episodic_reward_model=dict(type='episodic'), + reward_model=dict(type='ngu-reward'), ) pitfall_ppo_rnd_create_config = EasyDict(pitfall_ppo_rnd_create_config) create_config = pitfall_ppo_rnd_create_config if __name__ == "__main__": - from ding.entry import serial_pipeline_reward_model_ngu - serial_pipeline_reward_model_ngu([main_config, create_config], seed=0) + from ding.entry import serial_pipeline_reward_model_offpolicy + serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0) diff --git a/dizoo/atari/config/serial/pong/pong_ngu_config.py b/dizoo/atari/config/serial/pong/pong_ngu_config.py index 231f37d907..39527994c1 100644 --- a/dizoo/atari/config/serial/pong/pong_ngu_config.py +++ b/dizoo/atari/config/serial/pong/pong_ngu_config.py @@ -15,44 +15,49 @@ stop_value=20, frame_stack=4, ), - rnd_reward_model=dict( - intrinsic_reward_type='add', - learning_rate=1e-4, - obs_shape=[4, 84, 84], - action_shape=6, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. 
sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. - last_nonzero_reward_rescale=False, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. - last_nonzero_reward_weight=1, - intrinsic_reward_type='add', - learning_rate=1e-4, - obs_shape=[4, 84, 84], - action_shape=6, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='episodic', + reward_model=dict( + type='ngu-reward', + policy_nstep=nstep, + collect_env_num=collector_env_num, + rnd_reward_model=dict( + intrinsic_reward_type='add', + learning_rate=1e-4, + obs_shape=[4, 84, 84], + action_shape=6, + batch_size=320, + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='rnd-ngu', + ), + episodic_reward_model=dict( + # means if using rescale trick to the last non-zero reward + # when combing extrinsic and intrinsic reward. + # the rescale trick only used in: + # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal + # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. 
lunarlander + # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, + # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the + # original last nonzero extrinsic reward. + # please refer to ngu_reward_model for details. + last_nonzero_reward_rescale=False, + # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True + # please refer to ngu_reward_model for details. + last_nonzero_reward_weight=1, + intrinsic_reward_type='add', + learning_rate=1e-4, + obs_shape=[4, 84, 84], + action_shape=6, + batch_size=320, + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='episodic', + ), ), policy=dict( cuda=True, @@ -116,13 +121,12 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ngu'), - rnd_reward_model=dict(type='rnd-ngu'), - episodic_reward_model=dict(type='episodic'), + reward_model=dict(type='ngu-reward'), ) pong_ppo_rnd_create_config = EasyDict(pong_ppo_rnd_create_config) create_config = pong_ppo_rnd_create_config if __name__ == "__main__": # or you can enter `ding -m serial_ngu -c pong_ngu_config.py -s 0` - from ding.entry import serial_pipeline_ngu - serial_pipeline_ngu([main_config, create_config], seed=0) + from ding.entry import serial_pipeline_reward_model_offpolicy + serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py b/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py index d4cb7cfe26..ee60926c4c 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py @@ -13,44 +13,49 @@ n_evaluator_episode=evaluator_env_num, stop_value=195, ), - rnd_reward_model=dict( - intrinsic_reward_type='add', - learning_rate=5e-4, - obs_shape=8, - action_shape=4, - 
batch_size=320, # transitions - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=5, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. - last_nonzero_reward_rescale=True, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. - last_nonzero_reward_weight=100, - intrinsic_reward_type='add', - learning_rate=5e-4, - obs_shape=8, - action_shape=4, - batch_size=320, # transitions - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='episodic', + reward_model=dict( + type='ngu-reward', + policy_nstep=nstep, + collect_env_num=collector_env_num, + rnd_reward_model=dict( + intrinsic_reward_type='add', + learning_rate=5e-4, + obs_shape=8, + action_shape=4, + batch_size=320, # transitions + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=5, + hidden_size_list=[128, 128, 64], + type='rnd-ngu', + ), + episodic_reward_model=dict( + # means if using rescale trick to the last non-zero reward + # when combing extrinsic and intrinsic reward. + # the rescale trick only used in: + # 1. 
sparse reward env minigrid, in which the last non-zero reward is a strong positive signal + # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander + # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, + # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the + # original last nonzero extrinsic reward. + # please refer to ngu_reward_model for details. + last_nonzero_reward_rescale=True, + # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True + # please refer to ngu_reward_model for details. + last_nonzero_reward_weight=100, + intrinsic_reward_type='add', + learning_rate=5e-4, + obs_shape=8, + action_shape=4, + batch_size=320, # transitions + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='episodic', + ), ), policy=dict( cuda=True, @@ -118,13 +123,12 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ngu'), - rnd_reward_model=dict(type='rnd-ngu'), - episodic_reward_model=dict(type='episodic'), + reward_model=dict(type='ngu-reward'), ) lunarlander_ngu_create_config = EasyDict(lunarlander_ngu_create_config) create_config = lunarlander_ngu_create_config if __name__ == "__main__": # or you can enter `ding -m serial_ngu -c lunarlander_ngu_config.py -s 0` - from ding.entry import serial_pipeline_ngu - serial_pipeline_ngu([main_config, create_config], seed=0) + from ding.entry import serial_pipeline_reward_model_offpolicy + serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0) diff --git a/dizoo/classic_control/cartpole/config/cartpole_ngu_config.py b/dizoo/classic_control/cartpole/config/cartpole_ngu_config.py index 3aecbbb01b..df004328f2 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_ngu_config.py +++ 
b/dizoo/classic_control/cartpole/config/cartpole_ngu_config.py @@ -12,44 +12,49 @@ obs_plus_prev_action_reward=True, # use specific env wrapper for ngu policy stop_value=195, ), - rnd_reward_model=dict( - intrinsic_reward_type='add', - learning_rate=5e-4, - obs_shape=4, - action_shape=2, - batch_size=128, # transitions - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. - last_nonzero_reward_rescale=False, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. 
- last_nonzero_reward_weight=1, - intrinsic_reward_type='add', - learning_rate=5e-4, - obs_shape=4, - action_shape=2, - batch_size=128, # transitions - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='episodic', + reward_model=dict( + type='ngu-reward', + policy_nstep=5, + collect_env_num=8, + rnd_reward_model=dict( + intrinsic_reward_type='add', + learning_rate=5e-4, + obs_shape=4, + action_shape=2, + batch_size=128, # transitions + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='rnd-ngu', + ), + episodic_reward_model=dict( + # means if using rescale trick to the last non-zero reward + # when combing extrinsic and intrinsic reward. + # the rescale trick only used in: + # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal + # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander + # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, + # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the + # original last nonzero extrinsic reward. + # please refer to ngu_reward_model for details. + last_nonzero_reward_rescale=False, + # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True + # please refer to ngu_reward_model for details. 
+ last_nonzero_reward_weight=1, + intrinsic_reward_type='add', + learning_rate=5e-4, + obs_shape=4, + action_shape=2, + batch_size=128, # transitions + update_per_collect=10, + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='episodic', + ), ), policy=dict( cuda=True, @@ -112,13 +117,12 @@ ), env_manager=dict(type='base'), policy=dict(type='ngu'), - rnd_reward_model=dict(type='rnd-ngu'), - episodic_reward_model=dict(type='episodic'), + reward_model=dict(type='ngu-reward'), ) cartpole_ngu_create_config = EasyDict(cartpole_ngu_create_config) create_config = cartpole_ngu_create_config if __name__ == "__main__": # or you can enter `ding -m serial_ngu -c cartpole_ngu_config.py -s 0` - from ding.entry import serial_pipeline_ngu - serial_pipeline_ngu([main_config, create_config], seed=0) + from ding.entry import serial_pipeline_reward_model_offpolicy + serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0) diff --git a/dizoo/minigrid/config/minigrid_ngu_config.py b/dizoo/minigrid/config/minigrid_ngu_config.py index c1aa47e1eb..0369da1924 100644 --- a/dizoo/minigrid/config/minigrid_ngu_config.py +++ b/dizoo/minigrid/config/minigrid_ngu_config.py @@ -17,44 +17,49 @@ max_step=300, stop_value=0.96, ), - rnd_reward_model=dict( - intrinsic_reward_type='add', - learning_rate=5e-4, - obs_shape=2835, - action_shape=7, - batch_size=320, # transitions - update_per_collect=10, # 32*100/320=10 - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. 
the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. - last_nonzero_reward_rescale=True, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. - last_nonzero_reward_weight=100, - intrinsic_reward_type='add', - learning_rate=5e-4, - obs_shape=2739, - action_shape=7, - batch_size=320, # transitions - update_per_collect=10, # 32*100/64=50 - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='episodic', + reward_model=dict( + type='ngu-reward', + policy_nstep=nstep, + collect_env_num=collector_env_num, + rnd_reward_model=dict( + intrinsic_reward_type='add', + learning_rate=5e-4, + obs_shape=2835, + action_shape=7, + batch_size=320, # transitions + update_per_collect=10, # 32*100/320=10 + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='rnd-ngu', + ), + episodic_reward_model=dict( + # means if using rescale trick to the last non-zero reward + # when combing extrinsic and intrinsic reward. + # the rescale trick only used in: + # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal + # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander + # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, + # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the + # original last nonzero extrinsic reward. 
+ # please refer to ngu_reward_model for details. + last_nonzero_reward_rescale=True, + # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True + # please refer to ngu_reward_model for details. + last_nonzero_reward_weight=100, + intrinsic_reward_type='add', + learning_rate=5e-4, + obs_shape=2739, + action_shape=7, + batch_size=320, # transitions + update_per_collect=10, # 32*100/64=50 + only_use_last_five_frames_for_icm_rnd=False, + clear_buffer_per_iters=10, + nstep=nstep, + hidden_size_list=[128, 128, 64], + type='episodic', + ), ), policy=dict( cuda=True, @@ -117,13 +122,12 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='ngu'), - rnd_reward_model=dict(type='rnd-ngu'), - episodic_reward_model=dict(type='episodic'), + reward_model=dict(type='ngu-reward'), ) minigrid_ppo_ngu_create_config = EasyDict(minigrid_ppo_ngu_create_config) create_config = minigrid_ppo_ngu_create_config if __name__ == "__main__": # or you can enter `ding -m serial_ngu -c minigrid_ngu_config.py -s 0` - from ding.entry import serial_pipeline_ngu - serial_pipeline_ngu([main_config, create_config], seed=0) + from ding.entry import serial_pipeline_reward_model_offpolicy + serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0) From d3ce3e276929fb98e2a827ff59cb70aad7d1d5da Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Thu, 4 May 2023 02:45:22 -0400 Subject: [PATCH 28/52] remove ngu old entry --- ding/entry/serial_entry_ngu.py | 171 --------------------------------- 1 file changed, 171 deletions(-) delete mode 100644 ding/entry/serial_entry_ngu.py diff --git a/ding/entry/serial_entry_ngu.py b/ding/entry/serial_entry_ngu.py deleted file mode 100644 index 176f5558cd..0000000000 --- a/ding/entry/serial_entry_ngu.py +++ /dev/null @@ -1,171 +0,0 @@ -from typing import Union, Optional, List, Any, Tuple -import os -import torch -from ditk import logging -from functools import partial -from tensorboardX import 
SummaryWriter -from copy import deepcopy - -from ding.envs import get_vec_env_setting, create_env_manager -from ding.worker import BaseLearner, InteractionSerialEvaluator, BaseSerialCommander, create_buffer, \ - create_serial_collector -from ding.config import read_config, compile_config -from ding.policy import create_policy -from ding.reward_model import create_reward_model -from ding.utils import set_pkg_seed -from .utils import random_collect - - -def serial_pipeline_ngu( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - env_setting: Optional[List[Any]] = None, - model: Optional[torch.nn.Module] = None, - max_train_iter: Optional[int] = int(1e10), - max_env_step: Optional[int] = int(1e10), -) -> 'Policy': # noqa - """ - Overview: - Serial pipeline entry for NGU. The corresponding paper is - `never give up: learning directed exploration strategies`. - Arguments: - - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): Config in dict type. \ - ``str`` type means config file path. \ - ``Tuple[dict, dict]`` type means [user_config, create_cfg]. - - seed (:obj:`int`): Random seed. - - env_setting (:obj:`Optional[List[Any]]`): A list with 3 elements: \ - ``BaseEnv`` subclass, collector env config, and evaluator env config. - - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module. - - max_train_iter (:obj:`Optional[int]`): Maximum policy update iterations in training. - - max_env_step (:obj:`Optional[int]`): Maximum collected environment interaction steps. - Returns: - - policy (:obj:`Policy`): Converged policy. 
- """ - if isinstance(input_cfg, str): - cfg, create_cfg = read_config(input_cfg) - else: - cfg, create_cfg = deepcopy(input_cfg) - create_cfg.policy.type = create_cfg.policy.type + '_command' - env_fn = None if env_setting is None else env_setting[0] - cfg = compile_config(cfg, seed=seed, env=env_fn, auto=True, create_cfg=create_cfg, save_cfg=True) - # Create main components: env, policy - if env_setting is None: - env_fn, collector_env_cfg, evaluator_env_cfg = get_vec_env_setting(cfg.env) - else: - env_fn, collector_env_cfg, evaluator_env_cfg = env_setting - collector_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in collector_env_cfg]) - evaluator_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in evaluator_env_cfg]) - # if you want to save replay, please uncomment this line - # evaluator_env.enable_save_replay(cfg.env.replay_path) - - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed, dynamic_seed=False) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - policy = create_policy(cfg.policy, model=model, enable_field=['learn', 'collect', 'eval', 'command']) - - # Create worker components: learner, collector, evaluator, replay buffer, commander. 
- tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial')) - learner = BaseLearner(cfg.policy.learn.learner, policy.learn_mode, tb_logger, exp_name=cfg.exp_name) - collector = create_serial_collector( - cfg.policy.collect.collector, - env=collector_env, - policy=policy.collect_mode, - tb_logger=tb_logger, - exp_name=cfg.exp_name - ) - evaluator = InteractionSerialEvaluator( - cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode, tb_logger, exp_name=cfg.exp_name - ) - replay_buffer = create_buffer(cfg.policy.other.replay_buffer, tb_logger=tb_logger, exp_name=cfg.exp_name) - commander = BaseSerialCommander( - cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode - ) - rnd_reward_model = create_reward_model(cfg.rnd_reward_model, policy.collect_mode.get_attribute('device'), tb_logger) - episodic_reward_model = create_reward_model( - cfg.episodic_reward_model, policy.collect_mode.get_attribute('device'), tb_logger - ) - # ========== - # Main loop - # ========== - # Learner's before_run hook. - learner.call_hook('before_run') - - # Accumulate plenty of data at the beginning of training. 
- if cfg.policy.get('random_collect_size', 0) > 0: - random_collect(cfg.policy, policy, collector, collector_env, commander, replay_buffer) - - estimate_cnt = 0 - iter_ = 0 - while True: - """some hyper-parameters used in NGU""" - # index_to_eps = {i: 0.4 ** (1 + 8 * i / (self._env_num - 1)) for i in range(self._env_num)} - # index_to_beta = { - # i: 0.3 * torch.sigmoid(torch.tensor(10 * (2 * i - (collector_env_num - 2)) / (collector_env_num - 2))) - # for i in range(collector_env_num) - # } - # index_to_gamma = { - # i: 1 - torch.exp( - # ( - # (collector_env_num - 1 - i) * torch.log(torch.tensor(1 - 0.997)) + - # i * torch.log(torch.tensor(1 - 0.99)) - # ) / (collector_env_num - 1) - # ) - # for i in range(collector_env_num) - # } - iter_ += 1 - - # Evaluate policy performance - if evaluator.should_eval(learner.train_iter): - stop, reward = evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep) - if stop: - break - # Collect data by default config n_sample/n_episode - new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=None) - - # collect data for reward_model training - rnd_reward_model.collect_data(new_data) - episodic_reward_model.collect_data(new_data) - replay_buffer.push(new_data, cur_collector_envstep=collector.envstep) - - # update reward_model - rnd_reward_model.train() - if (iter_ + 1) % cfg.rnd_reward_model.clear_buffer_per_iters == 0: - rnd_reward_model.clear_data() - episodic_reward_model.train() - if (iter_ + 1) % cfg.episodic_reward_model.clear_buffer_per_iters == 0: - episodic_reward_model.clear_data() - - # Learn policy from collected data - for i in range(cfg.policy.learn.update_per_collect): - # Learner will train ``update_per_collect`` times in one iteration. 
- train_data = replay_buffer.sample(learner.policy.get_attribute('batch_size'), learner.train_iter) - if train_data is None: - # It is possible that replay buffer's data count is too few to train ``update_per_collect`` times - logging.warning( - "Replay buffer's data can only train for {} steps. ".format(i) + - "You can modify data collect config, e.g. increasing n_sample, n_episode." - ) - break - # calculate the inter-episodic and episodic intrinsic reward - rnd_reward = rnd_reward_model.estimate(train_data) - episodic_reward = episodic_reward_model.estimate(train_data) - - # update train_data reward using the augmented reward - train_data_augmented, estimate_cnt = episodic_reward_model.fusion_reward( - train_data, - rnd_reward, - episodic_reward, - nstep=cfg.policy.nstep, - collector_env_num=cfg.policy.collect.env_num, - tb_logger=tb_logger, - estimate_cnt=estimate_cnt - ) - learner.train(train_data_augmented, collector.envstep) - if learner.policy.get_attribute('priority'): - replay_buffer.update(learner.priority_info) - if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter: - break - - # Learner's after_run hook. 
- learner.call_hook('after_run') - return policy From 4c19aa34271b02d47c38a27b64ffbcd139988c6b Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Thu, 4 May 2023 02:52:54 -0400 Subject: [PATCH 29/52] fix env pool test bug --- ding/entry/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ding/entry/__init__.py b/ding/entry/__init__.py index e04299ed96..0dfaa73db3 100644 --- a/ding/entry/__init__.py +++ b/ding/entry/__init__.py @@ -5,7 +5,6 @@ from .serial_entry_onpolicy import serial_pipeline_onpolicy from .serial_entry_onpolicy_ppg import serial_pipeline_onpolicy_ppg from .serial_entry_offline import serial_pipeline_offline -from .serial_entry_ngu import serial_pipeline_ngu from .serial_entry_decision_transformer import serial_pipeline_dt from .serial_entry_reward_model_offpolicy import serial_pipeline_reward_model_offpolicy from .serial_entry_reward_model_onpolicy import serial_pipeline_reward_model_onpolicy From 0cc21494bdb7ae206a7efa8dfc9b68948e67e1b1 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Thu, 4 May 2023 05:29:12 -0400 Subject: [PATCH 30/52] add drex to new entry --- ...est_application_entry_drex_collect_data.py | 4 +- ding/reward_model/drex_reward_model.py | 20 +++++++- ding/reward_model/ngu_reward_model.py | 2 +- .../config/cartpole_drex_dqn_config.py | 47 +++++++++++++++---- 4 files changed, 59 insertions(+), 14 deletions(-) diff --git a/ding/entry/tests/test_application_entry_drex_collect_data.py b/ding/entry/tests/test_application_entry_drex_collect_data.py index de1ded03ee..c7942fdfaf 100644 --- a/ding/entry/tests/test_application_entry_drex_collect_data.py +++ b/ding/entry/tests/test_application_entry_drex_collect_data.py @@ -70,5 +70,5 @@ def test_drex_collecting_data(): args.cfg[0].bc_iteration = 1000 # for unittest args.cfg[1].policy.type = 'bc' drex_collecting_data(args=args) - os.popen('rm -rf {}'.format(expert_policy_state_dict_path)) - os.popen('rm -rf {}'.format(args.cfg[0].reward_model.offline_data_path)) + #os.popen('rm -rf 
{}'.format(expert_policy_state_dict_path)) + #os.popen('rm -rf {}'.format(args.cfg[0].reward_model.offline_data_path)) diff --git a/ding/reward_model/drex_reward_model.py b/ding/reward_model/drex_reward_model.py index 645b469088..a4727e6d12 100644 --- a/ding/reward_model/drex_reward_model.py +++ b/ding/reward_model/drex_reward_model.py @@ -1,6 +1,7 @@ import copy from easydict import EasyDict import pickle +import numpy as np from ding.utils import REWARD_MODEL_REGISTRY @@ -77,11 +78,26 @@ def load_expert_data(self) -> None: """ super(DrexRewardModel, self).load_expert_data() - with open(self.cfg.reward_model.offline_data_path + '/suboptimal_data.pkl', 'rb') as f: + with open(self.cfg.offline_data_path + '/suboptimal_data.pkl', 'rb') as f: self.demo_data = pickle.load(f) def train(self): - self._train() + # check if gpu available + device = self.device # torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + # Assume that we are on a CUDA machine, then this should print a CUDA device: + self._logger.info("device: {}".format(device)) + training_inputs, training_outputs = self.training_obs, self.training_labels + + cum_loss = 0.0 + training_data = list(zip(training_inputs, training_outputs)) + for epoch in range(self.cfg.update_per_collect): + np.random.shuffle(training_data) + training_obs, training_labels = zip(*training_data) + cum_loss = self._train(training_obs, training_labels) + self.train_iter += 1 + self._logger.info("[epoch {}] loss {}".format(epoch, cum_loss)) + self.tb_logger.add_scalar("drex_reward/train_loss_iteration", cum_loss, self.train_iter) + return_dict = self.pred_data(self.demo_data) res, pred_returns = return_dict['real'], return_dict['pred'] self._logger.info("real: " + str(res)) diff --git a/ding/reward_model/ngu_reward_model.py b/ding/reward_model/ngu_reward_model.py index 34ca9e7426..1c8ebe1139 100644 --- a/ding/reward_model/ngu_reward_model.py +++ b/ding/reward_model/ngu_reward_model.py @@ -453,7 +453,7 @@ class 
NGURewardModel(BaseRewardModel): ) def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> None: - super(NGURewardModel).__init__() + super(NGURewardModel, self).__init__() self.cfg = config self.tb_logger = tb_logger self.estimate_cnt = 0 diff --git a/dizoo/classic_control/cartpole/config/cartpole_drex_dqn_config.py b/dizoo/classic_control/cartpole/config/cartpole_drex_dqn_config.py index c898528a39..f30134f35c 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_drex_dqn_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_drex_dqn_config.py @@ -11,11 +11,12 @@ ), reward_model=dict( type='drex', + exp_name='cartpole_drex_dqn_seed0', min_snippet_length=5, max_snippet_length=100, checkpoint_min=0, - checkpoint_max=1000, - checkpoint_step=1000, + checkpoint_max=760, + checkpoint_step=760, learning_rate=1e-5, update_per_collect=1, # path to expert models that generate demonstration data @@ -23,25 +24,30 @@ # Absolute path is recommended. # In DI-engine, it is ``exp_name``. # For example, if you want to use dqn to generate demos, you can use ``spaceinvaders_dqn`` - expert_model_path='expert_model_path_placeholder', + expert_model_path='cartpole_dqn_seed0/ckpt/ckpt_best.pth.tar', # path to save reward model # Users should add their own model path here. # Absolute path is recommended. # For example, if you use ``spaceinvaders_drex``, then the reward model will be saved in this directory. - reward_model_path='reward_model_path_placeholder + ./spaceinvaders.params', + reward_model_path='cartpole_drex_dqn_seed0/cartpole.params', # path to save generated observations. # Users should add their own model path here. # Absolute path is recommended. # For example, if you use ``spaceinvaders_drex``, then all the generated data will be saved in this directory. - offline_data_path='offline_data_path_placeholder', + offline_data_path='cartpole_drex_dqn_seed0', # path to pretrained bc model. If omitted, bc will be trained instead. 
# Users should add their own model path here. Model path should lead to a model ckpt. # Absolute path is recommended. - bc_path='bc_path_placeholder', + # bc_path='bc_path_placeholder', # list of noises eps_list=[0, 0.5, 1], num_trajs_per_bin=20, + num_trajs=6, + num_snippets=6000, bc_iterations=6000, + hidden_size_list=[512, 64, 1], + obs_shape=4, + action_shape=2, ), policy=dict( cuda=False, @@ -57,7 +63,13 @@ batch_size=64, learning_rate=0.001, ), - collect=dict(n_sample=8, collector=dict(get_train_sample=False, )), + collect=dict( + n_sample=8, + collector=dict( + get_train_sample=False, + reward_shaping=False, + ), + ), eval=dict(evaluator=dict(eval_freq=40, )), other=dict( eps=dict( @@ -66,7 +78,7 @@ end=0.1, decay=10000, ), - replay_buffer=dict(replay_buffer_size=20000, ), + replay_buffer=dict(replay_buffer_size=200000, ), ), ), ) @@ -79,7 +91,24 @@ ), env_manager=dict(type='subprocess'), policy=dict(type='dqn'), - collector=dict(type='episode'), ) cartpole_drex_dqn_create_config = EasyDict(cartpole_drex_dqn_create_config) create_config = cartpole_drex_dqn_create_config + +if __name__ == "__main__": + import argparse + import torch + from ding.config import read_config + from ding.entry import drex_collecting_data + from ding.entry import serial_pipeline_reward_model_offpolicy + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', type=str, default='please enter abs path for this file') + parser.add_argument('--seed', type=int, default=0) + parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') + args = parser.parse_args() + args.cfg = read_config(args.cfg) + args.cfg[1].policy.type = 'bc' + args.cfg[0].policy.collect.n_episode = 8 + del args.cfg[0].policy.collect.n_sample + drex_collecting_data(args) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) From 5b4e4cc20f080b701b78e5be0be0555d3d66cc7c Mon Sep 17 00:00:00 2001 From: Ruoyu Gao 
Date: Thu, 4 May 2023 05:37:40 -0400 Subject: [PATCH 31/52] fix unit test for trex and gail --- .../tests/test_serial_entry_reward_model.py | 57 ++++++++++--------- ding/reward_model/__init__.py | 2 +- ding/reward_model/red_irl_model.py | 3 - .../reward_model/tests/test_gail_irl_model.py | 6 +- 4 files changed, 36 insertions(+), 32 deletions(-) diff --git a/ding/entry/tests/test_serial_entry_reward_model.py b/ding/entry/tests/test_serial_entry_reward_model.py index 1978e7ae67..875602c204 100644 --- a/ding/entry/tests/test_serial_entry_reward_model.py +++ b/ding/entry/tests/test_serial_entry_reward_model.py @@ -38,20 +38,6 @@ 'hidden_size_list': [64, 1], 'update_per_collect': 200, 'batch_size': 128, - }, { - 'type': 'trex', - 'exp_name': 'cartpole_trex_offppo_seed0', - 'min_snippet_length': 5, - 'max_snippet_length': 100, - 'checkpoint_min': 0, - 'checkpoint_max': 6, - 'checkpoint_step': 6, - 'learning_rate': 1e-5, - 'update_per_collect': 1, - 'expert_model_path': 'cartpole_ppo_offpolicy_seed0', - 'hidden_size_list': [512, 64, 1], - 'obs_shape': 4, - 'action_shape': 2, } ] @@ -67,15 +53,9 @@ def test_irl(reward_model_config): expert_data_path = 'expert_data.pkl' state_dict = expert_policy.collect_mode.state_dict() config = deepcopy(cartpole_ppo_offpolicy_config), deepcopy(cartpole_ppo_offpolicy_create_config) - if reward_model_config.type == 'trex': - trex_config = [deepcopy(cartpole_trex_offppo_config), deepcopy(cartpole_trex_offppo_create_config)] - trex_config[0].reward_model = reward_model_config - args = EasyDict({'cfg': deepcopy(trex_config), 'seed': 0, 'device': 'cpu'}) - trex_collecting_data(args=args) - else: - collect_demo_data( - config, seed=0, state_dict=state_dict, expert_data_path=expert_data_path, collect_count=collect_count - ) + collect_demo_data( + config, seed=0, state_dict=state_dict, expert_data_path=expert_data_path, collect_count=collect_count + ) # irl + rl training cp_cartpole_dqn_config = deepcopy(cartpole_dqn_config) 
cp_cartpole_dqn_create_config = deepcopy(cartpole_dqn_create_config) @@ -88,9 +68,6 @@ def test_irl(reward_model_config): cp_cartpole_dqn_config.policy.collect.n_sample = 128 cooptrain_reward = True pretrain_reward = False - if reward_model_config.type == 'trex': - cooptrain_reward = False - pretrain_reward = True serial_pipeline_reward_model_offpolicy( (cp_cartpole_dqn_config, cp_cartpole_dqn_create_config), seed=0, @@ -126,3 +103,31 @@ def test_ngu(): serial_pipeline_reward_model_offpolicy(config, seed=0, max_train_iter=2) except Exception: assert False, "pipeline fail" + + +@pytest.mark.unittest +def test_trex(): + exp_name = 'test_serial_pipeline_trex_expert' + config = [deepcopy(cartpole_ppo_offpolicy_config), deepcopy(cartpole_ppo_offpolicy_create_config)] + config[0].policy.learn.learner.hook.save_ckpt_after_iter = 100 + config[0].exp_name = exp_name + expert_policy = serial_pipeline(config, seed=0) + + exp_name = 'test_serial_pipeline_trex_collect' + config = [deepcopy(cartpole_trex_offppo_config), deepcopy(cartpole_trex_offppo_create_config)] + config[0].exp_name = exp_name + config[0].reward_model.exp_name = exp_name + config[0].reward_model.expert_model_path = 'test_serial_pipeline_trex_expert' + config[0].reward_model.checkpoint_max = 100 + config[0].reward_model.checkpoint_step = 100 + config[0].reward_model.num_snippets = 100 + args = EasyDict({'cfg': deepcopy(config), 'seed': 0, 'device': 'cpu'}) + trex_collecting_data(args=args) + try: + serial_pipeline_reward_model_offpolicy( + config, seed=0, max_train_iter=1, pretrain_reward=True, cooptrain_reward=False + ) + except Exception: + assert False, "pipeline fail" + finally: + os.popen('rm -rf test_serial_pipeline_trex*') diff --git a/ding/reward_model/__init__.py b/ding/reward_model/__init__.py index c1914116ac..414728289b 100644 --- a/ding/reward_model/__init__.py +++ b/ding/reward_model/__init__.py @@ -11,7 +11,7 @@ # exploration from .rnd_reward_model import RndRewardModel from 
.guided_cost_reward_model import GuidedCostRewardModel -from .ngu_reward_model import RndNGURewardModel, EpisodicNGURewardModel +from .ngu_reward_model import RndNGURewardModel, EpisodicNGURewardModel, NGURewardModel from .icm_reward_model import ICMRewardModel from .network import RepresentationNetwork, RNDNetwork, REDNetwork, GAILNetwork, ICMNetwork, GCLNetwork, TREXNetwork from .reword_model_utils import concat_state_action_pairs, combine_intrinsic_exterinsic_reward, obs_norm, collect_states diff --git a/ding/reward_model/red_irl_model.py b/ding/reward_model/red_irl_model.py index d5ff636ba7..f65c284c52 100644 --- a/ding/reward_model/red_irl_model.py +++ b/ding/reward_model/red_irl_model.py @@ -1,11 +1,8 @@ from typing import Dict, List import pickle import random -from collections.abc import Iterable -import torch import torch.optim as optim -import torch.nn.functional as F from ding.utils import REWARD_MODEL_REGISTRY, one_time_warning from .base_reward_model import BaseRewardModel diff --git a/ding/reward_model/tests/test_gail_irl_model.py b/ding/reward_model/tests/test_gail_irl_model.py index 6bf72d68c2..20bf21a7de 100644 --- a/ding/reward_model/tests/test_gail_irl_model.py +++ b/ding/reward_model/tests/test_gail_irl_model.py @@ -28,6 +28,7 @@ learning_rate=1e-3, update_per_collect=2, data_path=expert_data_path_1d, + clear_buffer_per_iters=1, ), cfg2 = dict( @@ -40,6 +41,7 @@ update_per_collect=2, data_path=expert_data_path_3d, action_size=action_space, + clear_buffer_per_iters=1, ), # create fake expert dataset @@ -77,7 +79,7 @@ def test_dataset_1d(cfg): policy.train() train_data_augmented = policy.estimate(data) assert 'reward' in train_data_augmented[0].keys() - policy.clear_data() + policy.clear_data(iter=1) assert len(policy.train_data) == 0 os.popen('rm -rf {}'.format(expert_data_path_1d)) @@ -101,6 +103,6 @@ def test_dataset_3d(cfg): policy.train() train_data_augmented = policy.estimate(data) assert 'reward' in train_data_augmented[0].keys() - 
policy.clear_data() + policy.clear_data(iter=1) assert len(policy.train_data) == 0 os.popen('rm -rf {}'.format(expert_data_path_3d)) From ff4de477f4bb1f2fe98bb502fea48e9f7509294c Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Thu, 4 May 2023 06:48:33 -0400 Subject: [PATCH 32/52] fix style --- ding/reward_model/ngu_reward_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ding/reward_model/ngu_reward_model.py b/ding/reward_model/ngu_reward_model.py index 1c8ebe1139..c5a5e23284 100644 --- a/ding/reward_model/ngu_reward_model.py +++ b/ding/reward_model/ngu_reward_model.py @@ -5,6 +5,7 @@ import numpy as np import torch import torch.optim as optim +from tensorboardX import SummaryWriter from easydict import EasyDict from ding.utils import RunningMeanStd From 9e63ef12b59a4e82ed90e6486788db5a3e8f33e8 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Thu, 4 May 2023 21:30:09 -0400 Subject: [PATCH 33/52] fix style for drex unittest --- ding/entry/tests/test_serial_entry_drex.py | 46 ++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 ding/entry/tests/test_serial_entry_drex.py diff --git a/ding/entry/tests/test_serial_entry_drex.py b/ding/entry/tests/test_serial_entry_drex.py new file mode 100644 index 0000000000..bded23a97f --- /dev/null +++ b/ding/entry/tests/test_serial_entry_drex.py @@ -0,0 +1,46 @@ +import pytest +import os +from easydict import EasyDict +from copy import deepcopy + +from dizoo.classic_control.cartpole.config.cartpole_dqn_config \ +import cartpole_dqn_config, cartpole_dqn_create_config +from dizoo.classic_control.cartpole.config.cartpole_drex_dqn_config \ +import cartpole_drex_dqn_config, cartpole_drex_dqn_create_config +from ding.entry import serial_pipeline, serial_pipeline_reward_model_offpolicy +from ding.entry.application_entry_drex_collect_data import drex_collecting_data + + +@pytest.mark.unittest +def test_drex(): + exp_name = 'test_serial_pipeline_drex_expert' + config = [deepcopy(cartpole_dqn_config), 
deepcopy(cartpole_dqn_create_config)] + config[0].policy.learn.learner.hook.save_ckpt_after_iter = 100 + config[0].exp_name = exp_name + expert_policy = serial_pipeline(config, seed=0) + + exp_name = 'test_serial_pipeline_drex_collect' + config = [deepcopy(cartpole_drex_dqn_config), deepcopy(cartpole_drex_dqn_create_config)] + config[0].exp_name = exp_name + config[0].reward_model.exp_name = exp_name + config[0].reward_model.expert_model_path = 'test_serial_pipeline_drex_expert/ckpt/ckpt_best.pth.tar' + config[0].reward_model.reward_model_path = 'test_serial_pipeline_drex_collect/cartpole.params' + config[0].reward_model.offline_data_path = 'test_serial_pipeline_drex_collect' + config[0].reward_model.checkpoint_max = 100 + config[0].reward_model.checkpoint_step = 100 + config[0].reward_model.num_snippets = 100 + + args = EasyDict({'cfg': deepcopy(config), 'seed': 0, 'device': 'cpu'}) + args.cfg[0].policy.collect.n_episode = 8 + del args.cfg[0].policy.collect.n_sample + args.cfg[0].bc_iteration = 1000 # for unittest + args.cfg[1].policy.type = 'bc' + drex_collecting_data(args=args) + try: + serial_pipeline_reward_model_offpolicy( + config, seed=0, max_train_iter=1, pretrain_reward=True, cooptrain_reward=False + ) + except Exception: + assert False, "pipeline fail" + finally: + os.popen('rm -rf test_serial_pipeline_drex*') From ca2e2dbd132e883e44dfcb86437a4e8364e993bb Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Fri, 5 May 2023 00:40:58 -0400 Subject: [PATCH 34/52] fix drex unittest --- dizoo/classic_control/cartpole/config/cartpole_dqn_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dizoo/classic_control/cartpole/config/cartpole_dqn_config.py b/dizoo/classic_control/cartpole/config/cartpole_dqn_config.py index 3e5ca613d0..0e76e2a081 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_dqn_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_dqn_config.py @@ -24,6 +24,7 @@ update_per_collect=5, batch_size=64, learning_rate=0.001, + 
learner=dict(hook=dict(save_ckpt_after_iter=100)), ), collect=dict(n_sample=8), eval=dict(evaluator=dict(eval_freq=40, )), From 8716afe2ab603a2f431bfea4245447e9d37d49e7 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Fri, 5 May 2023 03:25:51 -0400 Subject: [PATCH 35/52] fix bug in minigrid env --- dizoo/minigrid/envs/minigrid_env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dizoo/minigrid/envs/minigrid_env.py b/dizoo/minigrid/envs/minigrid_env.py index e0bdbfbc07..12bd64cae0 100644 --- a/dizoo/minigrid/envs/minigrid_env.py +++ b/dizoo/minigrid/envs/minigrid_env.py @@ -60,7 +60,7 @@ def reset(self) -> np.ndarray: self._env = ObsPlusPrevActRewWrapper(self._env) self._init_flag = True if self._flat_obs: - self._observation_space = gym.spaces.Box(0, 1, shape=(2835, ), dytpe=np.float32) + self._observation_space = gym.spaces.Box(0, 1, shape=(2835, ), dtype=np.float32) else: self._observation_space = self._env.observation_space # to be compatiable with subprocess env manager From 903614191af6924267e336b8c5a1b7f956e04a15 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Sat, 6 May 2023 03:29:45 -0400 Subject: [PATCH 36/52] add explain for rm utils --- ding/reward_model/reword_model_utils.py | 37 ++++++++++++++++--- .../config/cartpole_trex_offppo_config.py | 2 + dizoo/mujoco/config/hopper_gail_sac_config.py | 26 ++++++++----- 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/ding/reward_model/reword_model_utils.py b/ding/reward_model/reword_model_utils.py index 9b6fdefa50..c94dc69062 100644 --- a/ding/reward_model/reword_model_utils.py +++ b/ding/reward_model/reword_model_utils.py @@ -45,6 +45,16 @@ def concat_state_action_pairs( def combine_intrinsic_exterinsic_reward( train_data_augmented: Any, intrinsic_reward: List[torch.Tensor], config: EasyDict ) -> Any: + """ + Overview: + Concatenate intrinsic and extrinsic reward. + Arguments: + - train_data_augmented (:obj:`List`): List with at least ``reward`` keys. 
+ - intrinsic_reward(:obj:`List`): List which each item is the intrinsic reward + - config:(:obj:EasyDict) : self.config which must include intrinsic_reward_type + Returns: + - train_data_augmented (:obj:`List`): List with at least ``reward`` keys. + """ for item, in_rew in zip(train_data_augmented, intrinsic_reward): if config.intrinsic_reward_type == 'add': if config.extrinsic_reward_norm: @@ -62,6 +72,14 @@ def combine_intrinsic_exterinsic_reward( def collect_states(iterator) -> List: + """ + Overview: + collect state from data list(dict) + Arguments: + - iterator(:obj:`List`): List with at least ``obs`` keys. + Returns: + - res (:obj:`List`): List of obs. + """ res = [] for item in iterator: state = item['obs'] @@ -70,12 +88,21 @@ def collect_states(iterator) -> List: def obs_norm( - train_data: torch.Tensor, running_mean_std_rnd_obs: RunningMeanStd, config: EasyDict, device: str + train_data: torch.Tensor, running_mean_std_obs: RunningMeanStd, config: EasyDict, device: str ) -> torch.Tensor: - # Note: observation normalization: transform obs to mean 0, std 1, move norm obs to specific device - running_mean_std_rnd_obs.update(train_data.cpu().numpy()) - train_data = (train_data - to_tensor(running_mean_std_rnd_obs.mean - ).to(device)) / to_tensor(running_mean_std_rnd_obs.std).to(device) + """ + Overview: + transform obs to mean 0, std 1, move norm obs to the specific device + Arguments: + - train_data (:obj:`Tensor`): Tensor of obs + - running_mean_std_obs(:obj:RunningMeanStd): RunningMeanStd for obs + - config:(:obj:EasyDict) : self.config which must include obs_norm_clamp_max, obs_norm_clamp_min + Returns: + - train_data (:obj: Tensor`): Tensor of norm obs + """ + running_mean_std_obs.update(train_data.cpu().numpy()) + train_data = (train_data - to_tensor(running_mean_std_obs.mean).to(device)) / to_tensor(running_mean_std_obs.std + ).to(device) train_data = torch.clamp(train_data, min=config.obs_norm_clamp_min, max=config.obs_norm_clamp_max) return train_data 
diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py index b4d62b9008..8cde57b708 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py @@ -18,6 +18,8 @@ checkpoint_step=100, learning_rate=1e-5, update_per_collect=1, + num_trajs=0, + num_snippets=6000, expert_model_path='cartpole_ppo_seed0', # expert model experiment directory path hidden_size_list=[512, 64, 1], obs_shape=4, diff --git a/dizoo/mujoco/config/hopper_gail_sac_config.py b/dizoo/mujoco/config/hopper_gail_sac_config.py index b82a075e0c..f0fcdd4515 100644 --- a/dizoo/mujoco/config/hopper_gail_sac_config.py +++ b/dizoo/mujoco/config/hopper_gail_sac_config.py @@ -22,13 +22,13 @@ # Users should add their own model path here. Model path should lead to a model. # Absolute path is recommended. # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. - expert_model_path='model_path_placeholder', + expert_model_path='hopper_sac_seed0/ckpt/ckpt_best.pth.tar', # Path where to store the reward model - reward_model_path='data_path_placeholder+/reward_model/ckpt/ckpt_best.pth.tar', + reward_model_path='hopper_gail_sac_seed0/reward_model/ckpt/ckpt_best.pth.tar', # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. # Absolute path is recommended. # In DI-engine, it is usually located in ``exp_name`` directory - data_path='data_path_placeholder', + data_path='hopper_sac_seed0', collect_count=100000, ), policy=dict( @@ -88,13 +88,21 @@ # or you can enter `ding -m serial_gail -c hopper_gail_sac_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. 
hopper_sac_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.mujoco.config.hopper_sac_config import hopper_sac_config, hopper_sac_create_config + # set expert config from policy config in dizoo + expert_cfg = (hopper_sac_config, hopper_sac_create_config) expert_main_config = hopper_sac_config - expert_create_config = hopper_sac_create_config - serial_pipeline_gail( - [main_config, create_config], [expert_main_config, expert_create_config], - max_env_step=1000000, + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, - collect_data=True + state_dict_path=main_config.reward_model.expert_model_path, + expert_data_path=expert_data_path, + collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) From 6b9754ab62b92ab5184b225b7b93bb72c6c46f3e Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Sat, 6 May 2023 03:38:49 -0400 Subject: [PATCH 37/52] move RM unittest into one file --- ding/entry/tests/test_serial_entry_drex.py | 46 ------------------- .../tests/test_serial_entry_reward_model.py | 38 +++++++++++++++ 2 files changed, 38 insertions(+), 46 deletions(-) delete mode 100644 ding/entry/tests/test_serial_entry_drex.py diff --git a/ding/entry/tests/test_serial_entry_drex.py b/ding/entry/tests/test_serial_entry_drex.py deleted file mode 100644 index bded23a97f..0000000000 --- a/ding/entry/tests/test_serial_entry_drex.py +++ /dev/null @@ -1,46 +0,0 @@ -import pytest -import os -from easydict import EasyDict -from copy import deepcopy - -from dizoo.classic_control.cartpole.config.cartpole_dqn_config \ -import cartpole_dqn_config, cartpole_dqn_create_config -from dizoo.classic_control.cartpole.config.cartpole_drex_dqn_config \ -import cartpole_drex_dqn_config, cartpole_drex_dqn_create_config -from 
ding.entry import serial_pipeline, serial_pipeline_reward_model_offpolicy -from ding.entry.application_entry_drex_collect_data import drex_collecting_data - - -@pytest.mark.unittest -def test_drex(): - exp_name = 'test_serial_pipeline_drex_expert' - config = [deepcopy(cartpole_dqn_config), deepcopy(cartpole_dqn_create_config)] - config[0].policy.learn.learner.hook.save_ckpt_after_iter = 100 - config[0].exp_name = exp_name - expert_policy = serial_pipeline(config, seed=0) - - exp_name = 'test_serial_pipeline_drex_collect' - config = [deepcopy(cartpole_drex_dqn_config), deepcopy(cartpole_drex_dqn_create_config)] - config[0].exp_name = exp_name - config[0].reward_model.exp_name = exp_name - config[0].reward_model.expert_model_path = 'test_serial_pipeline_drex_expert/ckpt/ckpt_best.pth.tar' - config[0].reward_model.reward_model_path = 'test_serial_pipeline_drex_collect/cartpole.params' - config[0].reward_model.offline_data_path = 'test_serial_pipeline_drex_collect' - config[0].reward_model.checkpoint_max = 100 - config[0].reward_model.checkpoint_step = 100 - config[0].reward_model.num_snippets = 100 - - args = EasyDict({'cfg': deepcopy(config), 'seed': 0, 'device': 'cpu'}) - args.cfg[0].policy.collect.n_episode = 8 - del args.cfg[0].policy.collect.n_sample - args.cfg[0].bc_iteration = 1000 # for unittest - args.cfg[1].policy.type = 'bc' - drex_collecting_data(args=args) - try: - serial_pipeline_reward_model_offpolicy( - config, seed=0, max_train_iter=1, pretrain_reward=True, cooptrain_reward=False - ) - except Exception: - assert False, "pipeline fail" - finally: - os.popen('rm -rf test_serial_pipeline_drex*') diff --git a/ding/entry/tests/test_serial_entry_reward_model.py b/ding/entry/tests/test_serial_entry_reward_model.py index 875602c204..1548bcdf9e 100644 --- a/ding/entry/tests/test_serial_entry_reward_model.py +++ b/ding/entry/tests/test_serial_entry_reward_model.py @@ -7,6 +7,8 @@ from dizoo.classic_control.cartpole.config.cartpole_dqn_config import 
cartpole_dqn_config, cartpole_dqn_create_config from dizoo.classic_control.cartpole.config.cartpole_trex_offppo_config import cartpole_trex_offppo_config,\ cartpole_trex_offppo_create_config +from dizoo.classic_control.cartpole.config.cartpole_drex_dqn_config import cartpole_drex_dqn_config, \ + cartpole_drex_dqn_create_config from dizoo.classic_control.cartpole.config.cartpole_ppo_offpolicy_config import cartpole_ppo_offpolicy_config, cartpole_ppo_offpolicy_create_config # noqa from dizoo.classic_control.cartpole.config.cartpole_rnd_onppo_config import cartpole_ppo_rnd_config, cartpole_ppo_rnd_create_config # noqa from dizoo.classic_control.cartpole.config.cartpole_ppo_icm_config import cartpole_ppo_icm_config, cartpole_ppo_icm_create_config # noqa @@ -14,6 +16,7 @@ from ding.entry import serial_pipeline, collect_demo_data, serial_pipeline_reward_model_offpolicy, \ serial_pipeline_reward_model_onpolicy from ding.entry.application_entry_trex_collect_data import trex_collecting_data +from ding.entry.application_entry_drex_collect_data import drex_collecting_data cfg = [ { @@ -131,3 +134,38 @@ def test_trex(): assert False, "pipeline fail" finally: os.popen('rm -rf test_serial_pipeline_trex*') + + +@pytest.mark.unittest +def test_drex(): + exp_name = 'test_serial_pipeline_drex_expert' + config = [deepcopy(cartpole_dqn_config), deepcopy(cartpole_dqn_create_config)] + config[0].policy.learn.learner.hook.save_ckpt_after_iter = 100 + config[0].exp_name = exp_name + expert_policy = serial_pipeline(config, seed=0) + + exp_name = 'test_serial_pipeline_drex_collect' + config = [deepcopy(cartpole_drex_dqn_config), deepcopy(cartpole_drex_dqn_create_config)] + config[0].exp_name = exp_name + config[0].reward_model.exp_name = exp_name + config[0].reward_model.expert_model_path = 'test_serial_pipeline_drex_expert/ckpt/ckpt_best.pth.tar' + config[0].reward_model.reward_model_path = 'test_serial_pipeline_drex_collect/cartpole.params' + config[0].reward_model.offline_data_path = 
'test_serial_pipeline_drex_collect' + config[0].reward_model.checkpoint_max = 100 + config[0].reward_model.checkpoint_step = 100 + config[0].reward_model.num_snippets = 100 + + args = EasyDict({'cfg': deepcopy(config), 'seed': 0, 'device': 'cpu'}) + args.cfg[0].policy.collect.n_episode = 8 + del args.cfg[0].policy.collect.n_sample + args.cfg[0].bc_iteration = 1000 # for unittest + args.cfg[1].policy.type = 'bc' + drex_collecting_data(args=args) + try: + serial_pipeline_reward_model_offpolicy( + config, seed=0, max_train_iter=1, pretrain_reward=True, cooptrain_reward=False + ) + except Exception: + assert False, "pipeline fail" + finally: + os.popen('rm -rf test_serial_pipeline_drex*') From a5c798969c65731cdf4487004061632111cf20a7 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Mon, 8 May 2023 14:53:42 -0400 Subject: [PATCH 38/52] add drex config --- .../config/lunarlander_drex_dqn_config.py | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py diff --git a/dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py b/dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py new file mode 100644 index 0000000000..7f23439e45 --- /dev/null +++ b/dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py @@ -0,0 +1,119 @@ +from easydict import EasyDict + +nstep = 1 +lunarlander_drex_dqn_config = dict( + exp_name='lunarlander_drex_dqn_seed0', + env=dict( + # Whether to use shared memory. Only effective if "env_manager_type" is 'subprocess' + # Env number respectively for collector and evaluator. 
+ collector_env_num=8, + evaluator_env_num=8, + env_id='LunarLander-v2', + n_evaluator_episode=8, + stop_value=200, + ), + reward_model=dict( + type='drex', + exp_name='lunarlander_drex_dqn_seed0', + min_snippet_length=30, + max_snippet_length=100, + checkpoint_min=1000, + checkpoint_max=9000, + checkpoint_step=1000, + num_snippets=60000, + num_trajs_per_bin=20, + num_trajs=6, + bc_iterations=6000, + learning_rate=1e-5, + update_per_collect=1, + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. + expert_model_path='lunarlander_dqn_seed0/ckpt/ckpt_best.pth.tar', + reward_model_path='lunarlander_dqn_seed0/cartpole.params', + offline_data_path='lunarlander_drex_dqn_seed0', + hidden_size_list=[512, 64, 1], + obs_shape=8, + action_shape=4, + eps_list=[0, 0.5, 1], + ), + policy=dict( + # Whether to use cuda for network. + cuda=False, + model=dict( + obs_shape=8, + action_shape=4, + encoder_hidden_size_list=[512, 64], + # Whether to use dueling head. + dueling=True, + ), + # Reward's future discount factor, aka. gamma. + discount_factor=0.99, + # How many steps in td error. + nstep=nstep, + # learn_mode config + learn=dict( + update_per_collect=10, + batch_size=64, + learning_rate=0.001, + # Frequency of target network update. + target_update_freq=100, + ), + # collect_mode config + collect=dict( + # You can use either "n_sample" or "n_episode" in collector.collect. + # Get "n_sample" samples per collect. + n_sample=64, + # Cut trajectories into pieces with length "unroll_len". + unroll_len=1, + collector=dict(get_train_sample=False, reward_shaping=False,), + ), + # command_mode config + other=dict( + # Epsilon greedy with decay. + eps=dict( + # Decay type. Support ['exp', 'linear']. 
+ type='exp', + start=0.95, + end=0.1, + decay=50000, + ), + replay_buffer=dict(replay_buffer_size=100000, ) + ), + ), +) +lunarlander_drex_dqn_config = EasyDict(lunarlander_drex_dqn_config) +main_config = lunarlander_drex_dqn_config + +lunarlander_drex_dqn_create_config = dict( + env=dict( + type='lunarlander', + import_names=['dizoo.box2d.lunarlander.envs.lunarlander_env'], + ), + env_manager=dict(type='base'), + policy=dict(type='dqn'), + reward_model=dict(type='drex'), +) +lunarlander_drex_dqn_create_config = EasyDict(lunarlander_drex_dqn_create_config) +create_config = lunarlander_drex_dqn_create_config + +if __name__ == '__main__': + # Users should first run ``lunarlander_dqn_config.py`` to save models (or checkpoints). + # Note: Users should check that the checkpoints generated should include iteration_'checkpoint_min'.pth.tar, iteration_'checkpoint_max'.pth.tar with the interval checkpoint_step + # where checkpoint_max, checkpoint_min, checkpoint_step are specified above. + import argparse + import torch + from ding.config import read_config + from ding.entry import drex_collecting_data + from ding.entry import serial_pipeline_reward_model_offpolicy + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', type=str, default='please enter abs path for this file') + parser.add_argument('--seed', type=int, default=0) + parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') + args = parser.parse_args() + args.cfg = read_config(args.cfg) + args.cfg[1].policy.type = 'bc' + args.cfg[0].policy.collect.n_episode = 64 + del args.cfg[0].policy.collect.n_sample + drex_collecting_data(args) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False, max_env_step=int(1e7)) From f42d1313384116bfccc5f9a9f12da53f05318b60 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Tue, 9 May 2023 04:17:51 -0400 Subject: [PATCH 39/52] fix ngu wrapper bug in minigrid --- 
dizoo/minigrid/config/minigrid_ngu_config.py | 14 ++-- dizoo/minigrid/envs/minigrid_env.py | 3 +- dizoo/minigrid/envs/minigrid_wrapper.py | 70 ++++++++++++++++++++ dizoo/minigrid/utils/eval.py | 10 +-- 4 files changed, 83 insertions(+), 14 deletions(-) diff --git a/dizoo/minigrid/config/minigrid_ngu_config.py b/dizoo/minigrid/config/minigrid_ngu_config.py index 0369da1924..e2b0dd62af 100644 --- a/dizoo/minigrid/config/minigrid_ngu_config.py +++ b/dizoo/minigrid/config/minigrid_ngu_config.py @@ -4,7 +4,7 @@ evaluator_env_num = 8 nstep = 5 minigrid_ppo_ngu_config = dict( - exp_name='minigrid_doorkey_ngu_seed0', + exp_name='minigrid_fourroom_ngu_seed0', env=dict( collector_env_num=collector_env_num, evaluator_env_num=evaluator_env_num, @@ -12,7 +12,7 @@ # typical MiniGrid env id: # {'MiniGrid-Empty-8x8-v0', 'MiniGrid-FourRooms-v0', 'MiniGrid-DoorKey-8x8-v0','MiniGrid-DoorKey-16x16-v0'}, # please refer to https://github.com/Farama-Foundation/MiniGrid for details. - env_id='MiniGrid-DoorKey-16x16-v0', + env_id='MiniGrid-FourRooms-v0', obs_plus_prev_action_reward=True, # use specific env wrapper for ngu policy max_step=300, stop_value=0.96, @@ -50,7 +50,7 @@ last_nonzero_reward_weight=100, intrinsic_reward_type='add', learning_rate=5e-4, - obs_shape=2739, + obs_shape=2835, action_shape=7, batch_size=320, # transitions update_per_collect=10, # 32*100/64=50 @@ -73,7 +73,7 @@ # i.e., = = + learn_unroll_len=298, # set this key according to the episode length model=dict( - obs_shape=2739, + obs_shape=2835, action_shape=7, encoder_hidden_size_list=[128, 128, 512], collector_env_num=collector_env_num, @@ -91,7 +91,7 @@ # we want to collect data of length self._traj_len=INF # unless the episode enters the 'done' state. # In each collect phase, we collect a total of sequence samples. 
- n_sample=32, + n_sample=64, traj_len_inf=True, env_num=collector_env_num, ), @@ -120,7 +120,7 @@ type='minigrid', import_names=['dizoo.minigrid.envs.minigrid_env'], ), - env_manager=dict(type='subprocess'), + env_manager=dict(type='base'), policy=dict(type='ngu'), reward_model=dict(type='ngu-reward'), ) @@ -130,4 +130,4 @@ if __name__ == "__main__": # or you can enter `ding -m serial_ngu -c minigrid_ngu_config.py -s 0` from ding.entry import serial_pipeline_reward_model_offpolicy - serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0) + serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0, max_env_step=int(1e7)) diff --git a/dizoo/minigrid/envs/minigrid_env.py b/dizoo/minigrid/envs/minigrid_env.py index 12bd64cae0..2e4b4e0180 100644 --- a/dizoo/minigrid/envs/minigrid_env.py +++ b/dizoo/minigrid/envs/minigrid_env.py @@ -10,8 +10,7 @@ from matplotlib import animation import matplotlib.pyplot as plt from minigrid.wrappers import FlatObsWrapper, RGBImgPartialObsWrapper, ImgObsWrapper -from .minigrid_wrapper import ViewSizeWrapper -from ding.envs import ObsPlusPrevActRewWrapper +from .minigrid_wrapper import ViewSizeWrapper, ObsPlusPrevActRewWrapper from ding.envs import BaseEnv, BaseEnvTimestep from ding.torch_utils import to_ndarray, to_list diff --git a/dizoo/minigrid/envs/minigrid_wrapper.py b/dizoo/minigrid/envs/minigrid_wrapper.py index 09a14c9c81..02af761f00 100644 --- a/dizoo/minigrid/envs/minigrid_wrapper.py +++ b/dizoo/minigrid/envs/minigrid_wrapper.py @@ -1,4 +1,5 @@ import gymnasium as gym +import numpy as np from gymnasium import spaces from gymnasium.core import ObservationWrapper @@ -32,3 +33,72 @@ def observation(self, obs): # print('vis_mask:' + vis_mask) image = grid.encode(vis_mask) return {**obs, "image": image} + + +class ObsPlusPrevActRewWrapper(gym.Wrapper): + """ + Overview: + This wrapper is used in policy NGU. 
+ Set a dict {'obs': obs, 'prev_action': self.prev_action, 'prev_reward_extrinsic': self.prev_reward_extrinsic} + as the new wrapped observation, + which including the current obs, previous action and previous reward. + Interface: + ``__init__``, ``reset``, ``step`` + Properties: + - env (:obj:`gymnasium.Env`): the environment to wrap. + """ + + def __init__(self, env): + """ + Overview: + Initialize ``self.`` See ``help(type(self))`` for accurate signature; setup the properties. + Arguments: + - env (:obj:`gymnasium.Env`): the environment to wrap. + """ + super().__init__(env) + self.observation_space = gym.spaces.Dict( + { + 'obs': env.observation_space, + 'prev_action': env.action_space, + 'prev_reward_extrinsic': gym.spaces.Box( + low=env.reward_range[0], high=env.reward_range[1], shape=(1, ), dtype=np.float32 + ) + } + ) + self.prev_action = -1 # null action + self.prev_reward_extrinsic = 0 # null reward + + def reset(self, *, seed: int = None): + """ + Overview: + Resets the state of the environment. + Returns: + - obs (:obj:`Dict`) : the wrapped observation, which including the current obs, \ + previous action and previous reward. + """ + obs, info = self.env.reset(seed=seed) + obs = {'obs': obs, 'prev_action': self.prev_action, 'prev_reward_extrinsic': self.prev_reward_extrinsic} + return obs, info + + def step(self, action): + """ + Overview: + Step the environment with the given action. + Save the previous action and reward to be used in next new obs + Arguments: + - action (:obj:`Any`): the given action to step with. + Returns: + - obs (:obj:`Dict`) : the wrapped observation, which including the current obs, \ + previous action and previous reward. 
+ - reward (:obj:`Any`) : amount of reward returned after previous action + - done (:obj:`Bool`) : whether the episode has ended, in which case further \ + step() calls will return undefined results + - info (:obj:`Dict`) : contains auxiliary diagnostic information (helpful \ + for debugging, and sometimes learning) + """ + + obs, reward, done, truncated, info = self.env.step(action) + obs = {'obs': obs, 'prev_action': self.prev_action, 'prev_reward_extrinsic': self.prev_reward_extrinsic} + self.prev_action = action + self.prev_reward_extrinsic = reward + return obs, reward, done, truncated, info diff --git a/dizoo/minigrid/utils/eval.py b/dizoo/minigrid/utils/eval.py index e8e4f728fa..e3c6acb9fb 100644 --- a/dizoo/minigrid/utils/eval.py +++ b/dizoo/minigrid/utils/eval.py @@ -8,11 +8,11 @@ def eval( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - model: Optional[torch.nn.Module] = None, - state_dict: Optional[dict] = None, - replay_path: Optional[str] = './video', + input_cfg: Union[str, Tuple[dict, dict]], + seed: int = 0, + model: Optional[torch.nn.Module] = None, + state_dict: Optional[dict] = None, + replay_path: Optional[str] = './video', ) -> float: r""" Overview: From edff260975cb9293435e5799dd60ef13ff0a9d38 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Tue, 9 May 2023 05:54:21 -0400 Subject: [PATCH 40/52] fix ngu wrapper bug in minigrid --- dizoo/minigrid/envs/minigrid_env.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dizoo/minigrid/envs/minigrid_env.py b/dizoo/minigrid/envs/minigrid_env.py index 2e4b4e0180..b4b4aaecb0 100644 --- a/dizoo/minigrid/envs/minigrid_env.py +++ b/dizoo/minigrid/envs/minigrid_env.py @@ -59,7 +59,12 @@ def reset(self) -> np.ndarray: self._env = ObsPlusPrevActRewWrapper(self._env) self._init_flag = True if self._flat_obs: - self._observation_space = gym.spaces.Box(0, 1, shape=(2835, ), dtype=np.float32) + if type(self._env.observation_space) == gym.spaces.Dict: + obs_space = 
self._env.observation_space + obs_space['obs'] = gym.spaces.Box(0, 1, shape=(2835, ), dtype=np.float32) + self._observation_space = obs_space + else: + self._observation_space = gym.spaces.Box(0, 1, shape=(2835, ), dtype=np.float32) else: self._observation_space = self._env.observation_space # to be compatiable with subprocess env manager From cb0c6279e29fdbd18d613ed904e74faae903fab0 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Mon, 22 May 2023 01:48:02 -0400 Subject: [PATCH 41/52] refactor gcl, add it to reward entry --- ding/entry/serial_entry_guided_cost.py | 162 ------------------ .../tests/test_serial_entry_guided_cost.py | 23 --- .../tests/test_serial_entry_reward_model.py | 8 +- ding/reward_model/guided_cost_reward_model.py | 64 ++++++- .../cartpole/config/__init__.py | 2 +- .../cartpole/config/cartpole_gcl_config.py | 60 ++++--- .../config/cartpole_trex_dqn_config.py | 4 +- .../config/cartpole_trex_onppo_config.py | 3 +- 8 files changed, 105 insertions(+), 221 deletions(-) delete mode 100644 ding/entry/serial_entry_guided_cost.py delete mode 100644 ding/entry/tests/test_serial_entry_guided_cost.py diff --git a/ding/entry/serial_entry_guided_cost.py b/ding/entry/serial_entry_guided_cost.py deleted file mode 100644 index a66f4535a2..0000000000 --- a/ding/entry/serial_entry_guided_cost.py +++ /dev/null @@ -1,162 +0,0 @@ -from typing import Union, Optional, List, Any, Tuple -import os -import copy -import torch -from ditk import logging -from functools import partial -from tensorboardX import SummaryWriter -from copy import deepcopy - -from ding.envs import get_vec_env_setting, create_env_manager -from ding.worker import BaseLearner, InteractionSerialEvaluator, BaseSerialCommander, create_buffer, \ - create_serial_collector -from ding.config import read_config, compile_config -from ding.policy import create_policy -from ding.reward_model import create_reward_model -from ding.utils import set_pkg_seed, save_file -from .utils import random_collect - - -def 
serial_pipeline_guided_cost( - input_cfg: Union[str, Tuple[dict, dict]], - seed: int = 0, - env_setting: Optional[List[Any]] = None, - model: Optional[torch.nn.Module] = None, - expert_model: Optional[torch.nn.Module] = None, - max_train_iter: Optional[int] = int(1e10), - max_env_step: Optional[int] = int(1e10), -) -> 'Policy': # noqa - """ - Overview: - Serial pipeline guided cost: we create this serial pipeline in order to\ - implement guided cost learning in DI-engine. For now, we support the following envs\ - Cartpole, Lunarlander, Hopper, Halfcheetah, Walker2d. The demonstration\ - data come from the expert model. We use a well-trained model to \ - generate demonstration data online - Arguments: - - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): Config in dict type. \ - ``str`` type means config file path. \ - ``Tuple[dict, dict]`` type means [user_config, create_cfg]. - - seed (:obj:`int`): Random seed. - - env_setting (:obj:`Optional[List[Any]]`): A list with 3 elements: \ - ``BaseEnv`` subclass, collector env config, and evaluator env config. - - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module. - - expert_model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module.\ - The default model is DQN(**cfg.policy.model) - - max_train_iter (:obj:`Optional[int]`): Maximum policy update iterations in training. - - max_env_step (:obj:`Optional[int]`): Maximum collected environment interaction steps. - Returns: - - policy (:obj:`Policy`): Converged policy. 
- """ - if isinstance(input_cfg, str): - cfg, create_cfg = read_config(input_cfg) - else: - cfg, create_cfg = deepcopy(input_cfg) - create_cfg.policy.type = create_cfg.policy.type + '_command' - env_fn = None if env_setting is None else env_setting[0] - cfg = compile_config(cfg, seed=seed, env=env_fn, auto=True, create_cfg=create_cfg, save_cfg=True) - # Create main components: env, policy - if env_setting is None: - env_fn, collector_env_cfg, evaluator_env_cfg = get_vec_env_setting(cfg.env) - else: - env_fn, collector_env_cfg, evaluator_env_cfg = env_setting - collector_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in collector_env_cfg]) - expert_collector_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in collector_env_cfg]) - evaluator_env = create_env_manager(cfg.env.manager, [partial(env_fn, cfg=c) for c in evaluator_env_cfg]) - expert_collector_env.seed(cfg.seed) - collector_env.seed(cfg.seed) - evaluator_env.seed(cfg.seed, dynamic_seed=False) - expert_policy = create_policy(cfg.policy, model=expert_model, enable_field=['learn', 'collect']) - set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) - policy = create_policy(cfg.policy, model=model, enable_field=['learn', 'collect', 'eval', 'command']) - expert_policy.collect_mode.load_state_dict(torch.load(cfg.policy.collect.model_path, map_location='cpu')) - # Create worker components: learner, collector, evaluator, replay buffer, commander. 
- tb_logger = SummaryWriter(os.path.join('./{}/log/'.format(cfg.exp_name), 'serial')) - learner = BaseLearner(cfg.policy.learn.learner, policy.learn_mode, tb_logger, exp_name=cfg.exp_name) - collector = create_serial_collector( - cfg.policy.collect.collector, - env=collector_env, - policy=policy.collect_mode, - tb_logger=tb_logger, - exp_name=cfg.exp_name - ) - expert_collector = create_serial_collector( - cfg.policy.collect.collector, - env=expert_collector_env, - policy=expert_policy.collect_mode, - tb_logger=tb_logger, - exp_name=cfg.exp_name - ) - evaluator = InteractionSerialEvaluator( - cfg.policy.eval.evaluator, evaluator_env, policy.eval_mode, tb_logger, exp_name=cfg.exp_name - ) - replay_buffer = create_buffer(cfg.policy.other.replay_buffer, tb_logger=tb_logger, exp_name=cfg.exp_name) - expert_buffer = create_buffer(cfg.policy.other.replay_buffer, tb_logger=tb_logger, exp_name=cfg.exp_name) - commander = BaseSerialCommander( - cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode - ) - - reward_model = create_reward_model(cfg.reward_model, policy.collect_mode.get_attribute('device'), tb_logger) - # ========== - # Main loop - # ========== - # Learner's before_run hook. - learner.call_hook('before_run') - - # Accumulate plenty of data at the beginning of training. 
- if cfg.policy.get('random_collect_size', 0) > 0: - random_collect(cfg.policy, policy, collector, collector_env, commander, replay_buffer) - dirname = cfg.exp_name + '/reward_model' - if not os.path.exists(dirname): - try: - os.makedirs(dirname) - except FileExistsError: - pass - while True: - collect_kwargs = commander.step() - # Evaluate policy performance - if evaluator.should_eval(learner.train_iter): - stop, reward = evaluator.eval(learner.save_checkpoint, learner.train_iter, collector.envstep) - if stop: - break - # Collect data by default config n_sample/n_episode - new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs) - # NOTE: deepcopy data is very important, - # otherwise the data in the replay buffer will be incorrectly modified. - # NOTE: this line cannot move to line130, because in line134 the data may be modified in-place. - train_data = copy.deepcopy(new_data) - expert_data = expert_collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs) - replay_buffer.push(new_data, cur_collector_envstep=collector.envstep) - expert_buffer.push(expert_data, cur_collector_envstep=expert_collector.envstep) - # Learn policy from collected data - for i in range(cfg.reward_model.update_per_collect): - expert_demo = expert_buffer.sample(cfg.reward_model.batch_size, learner.train_iter) - samp = replay_buffer.sample(cfg.reward_model.batch_size, learner.train_iter) - reward_model.train(expert_demo, samp, learner.train_iter, collector.envstep) - for i in range(cfg.policy.learn.update_per_collect): - # Learner will train ``update_per_collect`` times in one iteration. - _ = reward_model.estimate(train_data) - if train_data is None: - # It is possible that replay buffer's data count is too few to train ``update_per_collect`` times - logging.warning( - "Replay buffer's data can only train for {} steps. ".format(i) + - "You can modify data collect config, e.g. increasing n_sample, n_episode." 
- ) - break - learner.train(train_data, collector.envstep) - if learner.policy.get_attribute('priority'): - replay_buffer.update(learner.priority_info) - if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter: - break - # save reward model - if learner.train_iter % cfg.reward_model.store_model_every_n_train == 0: - #if learner.train_iter%5000 == 0: - path = os.path.join(dirname, 'iteration_{}.pth.tar'.format(learner.train_iter)) - state_dict = reward_model.state_dict_reward_model() - save_file(path, state_dict) - path = os.path.join(dirname, 'final_model.pth.tar') - state_dict = reward_model.state_dict_reward_model() - save_file(path, state_dict) - # Learner's after_run hook. - learner.call_hook('after_run') - return policy diff --git a/ding/entry/tests/test_serial_entry_guided_cost.py b/ding/entry/tests/test_serial_entry_guided_cost.py deleted file mode 100644 index 33742d4fb8..0000000000 --- a/ding/entry/tests/test_serial_entry_guided_cost.py +++ /dev/null @@ -1,23 +0,0 @@ -import pytest -import torch -from copy import deepcopy -from ding.entry import serial_pipeline_onpolicy, serial_pipeline_guided_cost -from dizoo.classic_control.cartpole.config import cartpole_ppo_config, cartpole_ppo_create_config -from dizoo.classic_control.cartpole.config import cartpole_gcl_ppo_onpolicy_config, \ - cartpole_gcl_ppo_onpolicy_create_config - - -@pytest.mark.unittest -def test_guided_cost(): - expert_policy_state_dict_path = './expert_policy.pth' - config = [deepcopy(cartpole_ppo_config), deepcopy(cartpole_ppo_create_config)] - expert_policy = serial_pipeline_onpolicy(config, seed=0) - torch.save(expert_policy.collect_mode.state_dict(), expert_policy_state_dict_path) - - config = [deepcopy(cartpole_gcl_ppo_onpolicy_config), deepcopy(cartpole_gcl_ppo_onpolicy_create_config)] - config[0].policy.collect.model_path = expert_policy_state_dict_path - config[0].policy.learn.update_per_collect = 1 - try: - serial_pipeline_guided_cost(config, seed=0, 
max_train_iter=1) - except Exception: - assert False, "pipeline fail" diff --git a/ding/entry/tests/test_serial_entry_reward_model.py b/ding/entry/tests/test_serial_entry_reward_model.py index 1548bcdf9e..4dc3cd91fe 100644 --- a/ding/entry/tests/test_serial_entry_reward_model.py +++ b/ding/entry/tests/test_serial_entry_reward_model.py @@ -60,8 +60,12 @@ def test_irl(reward_model_config): config, seed=0, state_dict=state_dict, expert_data_path=expert_data_path, collect_count=collect_count ) # irl + rl training - cp_cartpole_dqn_config = deepcopy(cartpole_dqn_config) - cp_cartpole_dqn_create_config = deepcopy(cartpole_dqn_create_config) + if reward_model_config.type == 'guided_cost': + cp_cartpole_dqn_config = deepcopy(cartpole_ppo_offpolicy_config) + cp_cartpole_dqn_create_config = deepcopy(cartpole_ppo_offpolicy_create_config) + else: + cp_cartpole_dqn_config = deepcopy(cartpole_dqn_config) + cp_cartpole_dqn_create_config = deepcopy(cartpole_dqn_create_config) cp_cartpole_dqn_create_config.reward_model = dict(type=reward_model_config.type) if reward_model_config.type == 'gail': reward_model_config['data_path'] = '.' 
diff --git a/ding/reward_model/guided_cost_reward_model.py b/ding/reward_model/guided_cost_reward_model.py index 8121b30c63..15e95fd078 100644 --- a/ding/reward_model/guided_cost_reward_model.py +++ b/ding/reward_model/guided_cost_reward_model.py @@ -1,6 +1,9 @@ from typing import List, Dict, Any from easydict import EasyDict +from ditk import logging +import pickle +import random import torch import torch.nn as nn import torch.optim as optim @@ -72,6 +75,7 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> assert device == "cpu" or device.startswith("cuda") self.device = device self.tb_logger = tb_logger + self.iter = 0 self.reward_model = GCLNetwork( config.input_size, [config.hidden_size, config.hidden_size], output_size=1, @@ -79,8 +83,39 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> ) self.reward_model.to(self.device) self.opt = optim.Adam(self.reward_model.parameters(), lr=config.learning_rate) + self.train_data = [] + self.load_expert_data() - def train(self, expert_demo: torch.Tensor, samp: torch.Tensor, iter, step): + def load_expert_data(self) -> None: + """ + Overview: + Getting the expert data from ``config['expert_data_path']`` attribute in self. + Effects: + This is a side effect function which updates the expert data attribute (e.g. ``self.expert_data``) + """ + with open(self.cfg.expert_data_path, 'rb') as f: + self.expert_data = pickle.load(f) + + def train(self) -> None: + """ + Overview: + Train the reward model. 
+ """ + # sample data for expert and train data + sample_size = min(len(self.expert_data), self.cfg.batch_size) + expert_demo = random.sample(self.expert_data, sample_size) + samp = random.sample(self.train_data, sample_size) + + # remove non-tensor data in data list + samp = self._remove_redundant_keys(samp) + + # train the reward model + for _ in range(self.cfg.update_per_collect): + loss_ioc = self._train(expert_demo, samp) + self.tb_logger.add_scalar('reward_model/loss_iter', loss_ioc, self.iter) + self.iter += 1 + + def _train(self, expert_demo: torch.Tensor, samp: torch.Tensor) -> float: device_0 = expert_demo[0]['obs'].device device_1 = samp[0]['obs'].device for i in range(len(expert_demo)): @@ -106,9 +141,8 @@ def train(self, expert_demo: torch.Tensor, samp: torch.Tensor, iter, step): self.opt.zero_grad() loss_IOC.backward() self.opt.step() - if iter % self.cfg.log_every_n_train == 0: - self.tb_logger.add_scalar('reward_model/loss_iter', loss_IOC, iter) - self.tb_logger.add_scalar('reward_model/loss_step', loss_IOC, step) + + return loss_IOC.item() def estimate(self, data: list) -> List[Dict]: # NOTE: this estimate method of gcl alg. is a little different from the one in other irl alg., @@ -130,7 +164,7 @@ def collect_data(self, data) -> None: if online_net is trained continuously, there should be some implementations in collect_data method """ # if online_net is trained continuously, there should be some implementations in collect_data method - pass + self.train_data.extend(data) def clear_data(self, iter: int): """ @@ -150,3 +184,23 @@ def state_dict_reward_model(self) -> Dict[str, Any]: def load_state_dict_reward_model(self, state_dict: Dict[str, Any]) -> None: self.reward_model.load_state_dict(state_dict['model']) self.opt.load_state_dict(state_dict['optimizer']) + + def _remove_redundant_keys(self, samp: List[Dict]) -> List[Dict]: + """ + Overview: + Remove redundant keys in the data list. + Arguments: + - samp (:obj:`List[Dict]`): The data list. 
+ Returns: + - (:obj:`List[Dict]`): The data list without redundant keys. + """ + keeped_keys = ['obs', 'next_obs', 'action', 'logit'] + assert samp is not None and bool(samp), "samp is empty." + assert all(key in samp[0] for key in keeped_keys), "samp is missing required keys." + fixed_samp = [] + for item in samp: + fixed_item = {} + for key in keeped_keys: + fixed_item[key] = item[key] + fixed_samp.append(fixed_item) + return fixed_samp diff --git a/dizoo/classic_control/cartpole/config/__init__.py b/dizoo/classic_control/cartpole/config/__init__.py index 3d6d124274..0891fdb75d 100644 --- a/dizoo/classic_control/cartpole/config/__init__.py +++ b/dizoo/classic_control/cartpole/config/__init__.py @@ -4,7 +4,7 @@ from .cartpole_dqfd_config import cartpole_dqfd_config, cartpole_dqfd_create_config from .cartpole_dqn_config import cartpole_dqn_config, cartpole_dqn_create_config from .cartpole_dqn_gail_config import cartpole_dqn_gail_config, cartpole_dqn_gail_create_config -from .cartpole_gcl_config import cartpole_gcl_ppo_onpolicy_config, cartpole_gcl_ppo_onpolicy_create_config +from .cartpole_gcl_config import cartpole_gcl_ppo_offpolicy_config, cartpole_gcl_ppo_offpolicy_create_config from .cartpole_impala_config import cartpole_impala_config, cartpole_impala_create_config from .cartpole_iqn_config import cartpole_iqn_config, cartpole_iqn_create_config from .cartpole_ppo_offpolicy_config import cartpole_ppo_offpolicy_config, cartpole_ppo_offpolicy_create_config diff --git a/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py b/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py index c4c8faf083..a6ea193ef1 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py @@ -1,7 +1,7 @@ from easydict import EasyDict -cartpole_gcl_ppo_onpolicy_config = dict( - exp_name='cartpole_gcl_seed0', +cartpole_gcl_ppo_offpolicy_config = dict( + exp_name='cartpole_gcl_offpolicy_seed0', 
env=dict( collector_env_num=8, evaluator_env_num=5, @@ -13,56 +13,68 @@ input_size=5, batch_size=32, continuous=False, + expert_data_path='cartpole_ppo_offpolicy_seed0/expert_data.pkl', + expert_model_path='cartpole_ppo_offpolicy_seed0/ckpt/ckpt_best.pth.tar', update_per_collect=10, + collect_count=1000, ), policy=dict( cuda=False, - recompute_adv=True, - action_space='discrete', model=dict( obs_shape=4, action_shape=2, - action_space='discrete', encoder_hidden_size_list=[64, 64, 128], critic_head_hidden_size=128, actor_head_hidden_size=128, + action_space='discrete', ), learn=dict( - update_per_collect=2, + update_per_collect=6, batch_size=64, learning_rate=0.001, + value_weight=0.5, entropy_weight=0.01, + clip_ratio=0.2, + learner=dict(hook=dict(save_ckpt_after_iter=1000)), ), collect=dict( - # Users should add their own model path here. Model path should lead to a model. - # Absolute path is recommended. - # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. - model_path='model_path_placeholder', - # If you need the data collected by the collector to contain logit key which reflect the probability of - # the action, you can change the key to be True. - # In Guided cost Learning, we need to use logit to train the reward model, we change the key to be True. 
- collector_logit=True, # add logit into collected transition - n_sample=256, + n_sample=128, + unroll_len=1, discount_factor=0.9, gae_lambda=0.95, ), - eval=dict(evaluator=dict(eval_freq=50, ), ), + eval=dict(evaluator=dict(eval_freq=40, )), + other=dict(replay_buffer=dict(replay_buffer_size=5000)) ), ) -cartpole_gcl_ppo_onpolicy_config = EasyDict(cartpole_gcl_ppo_onpolicy_config) -main_config = cartpole_gcl_ppo_onpolicy_config -cartpole_gcl_ppo_onpolicy_create_config = dict( +cartpole_gcl_ppo_offpolicy_config = EasyDict(cartpole_gcl_ppo_offpolicy_config) +main_config = cartpole_gcl_ppo_offpolicy_config +cartpole_gcl_ppo_offpolicy_create_config = dict( env=dict( type='cartpole', import_names=['dizoo.classic_control.cartpole.envs.cartpole_env'], ), env_manager=dict(type='base'), - policy=dict(type='ppo'), + policy=dict(type='ppo_offpolicy'), reward_model=dict(type='guided_cost'), ) -cartpole_gcl_ppo_onpolicy_create_config = EasyDict(cartpole_gcl_ppo_onpolicy_create_config) -create_config = cartpole_gcl_ppo_onpolicy_create_config +cartpole_gcl_ppo_offpolicy_create_config = EasyDict(cartpole_gcl_ppo_offpolicy_create_config) +create_config = cartpole_gcl_ppo_offpolicy_create_config if __name__ == "__main__": - from ding.entry import serial_pipeline_guided_cost - serial_pipeline_guided_cost((main_config, create_config), seed=0) + # or you can enter `ding -m serial -c cartpole_ppo_offpolicy_config.py -s 0` + from ding.entry import collect_demo_data, serial_pipeline_reward_model_offpolicy + from dizoo.classic_control.cartpole.config.cartpole_ppo_offpolicy_config import cartpole_ppo_offpolicy_config, cartpole_ppo_offpolicy_create_config + + expert_cfg = (cartpole_ppo_offpolicy_config, cartpole_ppo_offpolicy_create_config) + expert_data_path = main_config.reward_model.expert_data_path + state_dict_path = main_config.reward_model.expert_model_path + collect_count = main_config.reward_model.collect_count + collect_demo_data( + expert_cfg, + seed=0, + 
state_dict_path=state_dict_path, + expert_data_path=expert_data_path, + collect_count=collect_count + ) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py index c2b1cf1943..9e6f20eaf3 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py @@ -72,7 +72,7 @@ # Note: Users should check that the checkpoints generated should include iteration_'checkpoint_min'.pth.tar, iteration_'checkpoint_max'.pth.tar with the interval checkpoint_step # where checkpoint_max, checkpoint_min, checkpoint_step are specified above. # example of running this file: - # python cartpole_trex_dqn_config.py --cfg cartpole_trex_dqn_config.py --seed 0 --device cpu + # python cartpole_trex_dqn_config.py --cfg cartpole_trex_dqn_config.py --seed 0 --device cpu import argparse import torch from ding.entry import trex_collecting_data @@ -84,4 +84,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy((main_config, create_config),pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py index 30608a9053..3faf422ead 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py @@ -70,7 +70,7 @@ # Note: Users should check that the checkpoints generated should include iteration_'checkpoint_min'.pth.tar, iteration_'checkpoint_max'.pth.tar with the interval checkpoint_step # where checkpoint_max, checkpoint_min, checkpoint_step are specified above. # example of running this file: - # python cartpole_trex_onppo_config.py --cfg cartpole_trex_onppo_config.py --seed 0 --device cpu + # python cartpole_trex_onppo_config.py --cfg cartpole_trex_onppo_config.py --seed 0 --device cpu import argparse import torch from ding.entry import trex_collecting_data @@ -83,4 +83,3 @@ # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) serial_pipeline_reward_model_onpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) - From 016fbb3b2f2e845d068663196ed294cd3a9154ea Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Mon, 22 May 2023 02:03:09 -0400 Subject: [PATCH 42/52] refactor gcl config and bash format other config --- .../config/lunarlander_cont_sac_config.py | 4 +-- .../config/lunarlander_drex_dqn_config.py | 9 ++++-- .../config/lunarlander_gcl_config.py | 28 +++++++++++++------ dizoo/mujoco/config/halfcheetah_bdq_config.py | 7 +++-- .../config/halfcheetah_gcl_sac_config.py | 27 ++++++++++++------ .../config/halfcheetah_trex_sac_config.py | 2 +- dizoo/mujoco/config/hopper_bdq_config.py | 6 +++- dizoo/mujoco/config/hopper_gcl_config.py | 27 ++++++++++++------ dizoo/mujoco/config/hopper_trex_sac_config.py | 2 +- dizoo/mujoco/config/walker2d_gcl_config.py | 27 ++++++++++++------ .../config/walker2d_trex_onppo_config.py | 2 +- .../mujoco/config/walker2d_trex_sac_config.py | 2 +- 12 files changed, 95 insertions(+), 48 deletions(-) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py b/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py index 0e60fce608..f8a8ab47e7 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_cont_sac_config.py @@ -28,9 +28,7 @@ learning_rate_alpha=3e-4, auto_alpha=True, ), - collect=dict( - n_sample=256, - ), + collect=dict(n_sample=256, ), eval=dict(evaluator=dict(eval_freq=1000, ), ), other=dict(replay_buffer=dict(replay_buffer_size=int(1e5), ), ), ), diff --git a/dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py b/dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py index 7f23439e45..17e98559da 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py @@ -66,7 +66,10 @@ n_sample=64, # Cut trajectories into pieces 
with length "unroll_len". unroll_len=1, - collector=dict(get_train_sample=False, reward_shaping=False,), + collector=dict( + get_train_sample=False, + reward_shaping=False, + ), ), # command_mode config other=dict( @@ -116,4 +119,6 @@ args.cfg[0].policy.collect.n_episode = 64 del args.cfg[0].policy.collect.n_sample drex_collecting_data(args) - serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False, max_env_step=int(1e7)) + serial_pipeline_reward_model_offpolicy( + (main_config, create_config), pretrain_reward=True, cooptrain_reward=False, max_env_step=int(1e7) + ) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py b/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py index 60065ae33b..85932f9928 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py @@ -15,6 +15,9 @@ batch_size=32, continuous=False, update_per_collect=20, + expert_data_path='lunarlander_ppo_offpolicy_seed0/expert_data.pkl', + expert_model_path='lunarlander_ppo_offpolicy_seed0/ckpt/ckpt_best.pth.tar', + collect_count=100000, ), policy=dict( cuda=False, @@ -35,13 +38,6 @@ adv_norm=True, ), collect=dict( - # Users should add their own model path here. Model path should lead to a model. - # Absolute path is recommended. - # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. - model_path='model_path_placeholder', - # If you need the data collected by the collector to contain logit key which reflect the probability of - # the action, you can change the key to be True. - # In Guided cost Learning, we need to use logit to train the reward model, we change the key to be True. 
collector_logit=True, n_sample=800, unroll_len=1, @@ -65,5 +61,19 @@ create_config = lunarlander_ppo_create_config if __name__ == "__main__": - from ding.entry import serial_pipeline_guided_cost - serial_pipeline_guided_cost([main_config, create_config], seed=0) + # or you can enter `ding -m serial -c lunarlander_ppo_offpolicy_config.py -s 0` + from ding.entry import collect_demo_data, serial_pipeline_reward_model_offpolicy + from dizoo.box2d.lunarlander.config.lunarlander_offppo_config import lunarlander_ppo_offpolicy_config, lunarlander_ppo_offpolicy_create_config + + expert_cfg = (lunarlander_ppo_offpolicy_config, lunarlander_ppo_offpolicy_create_config) + expert_data_path = main_config.reward_model.expert_data_path + state_dict_path = main_config.reward_model.expert_model_path + collect_count = main_config.reward_model.collect_count + collect_demo_data( + expert_cfg, + seed=0, + state_dict_path=state_dict_path, + expert_data_path=expert_data_path, + collect_count=collect_count + ) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/mujoco/config/halfcheetah_bdq_config.py b/dizoo/mujoco/config/halfcheetah_bdq_config.py index 145bf8062e..25fb65ba35 100644 --- a/dizoo/mujoco/config/halfcheetah_bdq_config.py +++ b/dizoo/mujoco/config/halfcheetah_bdq_config.py @@ -22,7 +22,6 @@ action_bins_per_branch=2, # mean the action shape is 6, 2 discrete actions for each action dimension encoder_hidden_size_list=[256, 256, 128], ), - learn=dict( batch_size=512, learning_rate=3e-4, @@ -65,4 +64,8 @@ if __name__ == "__main__": # or you can enter `ding -m serial_onpolicy -c halfcheetah_onbdq_config.py -s 0` from ding.entry import serial_pipeline - serial_pipeline((main_config, create_config), seed=0, max_env_step=10000000,) \ No newline at end of file + serial_pipeline( + (main_config, create_config), + seed=0, + max_env_step=10000000, + ) diff --git a/dizoo/mujoco/config/halfcheetah_gcl_sac_config.py 
b/dizoo/mujoco/config/halfcheetah_gcl_sac_config.py index 367b7bcf03..6b06a1094b 100644 --- a/dizoo/mujoco/config/halfcheetah_gcl_sac_config.py +++ b/dizoo/mujoco/config/halfcheetah_gcl_sac_config.py @@ -17,7 +17,10 @@ batch_size=32, action_shape=6, continuous=True, + expert_data_path='halfcheetah_sac_seed0/expert_data.pkl', + expert_model_path='halfcheetah_sac_seed0/ckpt/ckpt_best.pth.tar', update_per_collect=20, + collect_count=300000, ), policy=dict( cuda=False, @@ -45,13 +48,6 @@ auto_alpha=False, ), collect=dict( - # Users should add their own model path here. Model path should lead to a model. - # Absolute path is recommended. - # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. - model_path='model_path_placeholder', - # If you need the data collected by the collector to contain logit key which reflect the probability of - # the action, you can change the key to be True. - # In Guided cost Learning, we need to use logit to train the reward model, we change the key to be True. 
collector_logit=True, n_sample=256, unroll_len=1, @@ -82,5 +78,18 @@ create_config = halfcheetah_gcl_sac_create_config if __name__ == '__main__': - from ding.entry import serial_pipeline_guided_cost - serial_pipeline_guided_cost((main_config, create_config), seed=0) + from ding.entry import collect_demo_data, serial_pipeline_reward_model_offpolicy + from dizoo.mujoco.config.halfcheetah_sac_config import halfcheetah_sac_config, halfcheetah_sac_create_config + + expert_cfg = (halfcheetah_sac_config, halfcheetah_sac_create_config) + expert_data_path = main_config.reward_model.expert_data_path + state_dict_path = main_config.reward_model.expert_model_path + collect_count = main_config.reward_model.collect_count + collect_demo_data( + expert_cfg, + seed=0, + state_dict_path=state_dict_path, + expert_data_path=expert_data_path, + collect_count=collect_count + ) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/mujoco/config/halfcheetah_trex_sac_config.py b/dizoo/mujoco/config/halfcheetah_trex_sac_config.py index fcd2d3f179..30f9581771 100644 --- a/dizoo/mujoco/config/halfcheetah_trex_sac_config.py +++ b/dizoo/mujoco/config/halfcheetah_trex_sac_config.py @@ -98,4 +98,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_onpolicy([main_config, create_config],pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/mujoco/config/hopper_bdq_config.py b/dizoo/mujoco/config/hopper_bdq_config.py index de08da2a7a..34dbe21664 100644 --- a/dizoo/mujoco/config/hopper_bdq_config.py +++ b/dizoo/mujoco/config/hopper_bdq_config.py @@ -68,4 +68,8 @@ if __name__ == "__main__": # or you can enter `ding -m serial_onpolicy -c hopper_bdq_config.py -s 0` from ding.entry import serial_pipeline - serial_pipeline([main_config, create_config], seed=0, max_env_step=10000000,) + serial_pipeline( + [main_config, create_config], + seed=0, + max_env_step=10000000, + ) diff --git a/dizoo/mujoco/config/hopper_gcl_config.py b/dizoo/mujoco/config/hopper_gcl_config.py index 214f44dbf7..d098288ca0 100644 --- a/dizoo/mujoco/config/hopper_gcl_config.py +++ b/dizoo/mujoco/config/hopper_gcl_config.py @@ -17,7 +17,10 @@ batch_size=32, action_shape=3, continuous=True, + expert_data_path='hopper_sac_seed0/expert_data.pkl', + expert_model_path='hopper_sac_seed0/ckpt/ckpt_best.pth.tar', update_per_collect=20, + collect_count=100000, ), policy=dict( cuda=False, @@ -38,13 +41,6 @@ adv_norm=True, ), collect=dict( - # Users should add their own model path here. Model path should lead to a model. - # Absolute path is recommended. - # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. - model_path='model_path_placeholder', - # If you need the data collected by the collector to contain logit key which reflect the probability of - # the action, you can change the key to be True. - # In Guided cost Learning, we need to use logit to train the reward model, we change the key to be True. 
collector_logit=True, n_sample=2048, unroll_len=1, @@ -70,5 +66,18 @@ create_config = hopper_gcl_create_config if __name__ == '__main__': - from ding.entry import serial_pipeline_guided_cost - serial_pipeline_guided_cost((main_config, create_config), seed=0) + from ding.entry import collect_demo_data, serial_pipeline_reward_model_offpolicy + from dizoo.mujoco.config.hopper_sac_config import hopper_sac_config, hopper_sac_create_config + + expert_cfg = (hopper_sac_config, hopper_sac_create_config) + expert_data_path = main_config.reward_model.expert_data_path + state_dict_path = main_config.reward_model.expert_model_path + collect_count = main_config.reward_model.collect_count + collect_demo_data( + expert_cfg, + seed=0, + state_dict_path=state_dict_path, + expert_data_path=expert_data_path, + collect_count=collect_count + ) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/mujoco/config/hopper_trex_sac_config.py b/dizoo/mujoco/config/hopper_trex_sac_config.py index b582773f86..f37d39240c 100644 --- a/dizoo/mujoco/config/hopper_trex_sac_config.py +++ b/dizoo/mujoco/config/hopper_trex_sac_config.py @@ -98,4 +98,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy([main_config, create_config],pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/mujoco/config/walker2d_gcl_config.py b/dizoo/mujoco/config/walker2d_gcl_config.py index 1b0b56fa32..08c0fa5bef 100644 --- a/dizoo/mujoco/config/walker2d_gcl_config.py +++ b/dizoo/mujoco/config/walker2d_gcl_config.py @@ -17,7 +17,10 @@ batch_size=32, action_shape=6, continuous=True, + expert_data_path='walker2d_sac_seed0/expert_data.pkl', + expert_model_path='walker2d_sac_seed0/ckpt/ckpt_best.pth.tar', update_per_collect=20, + collect_count=100000, ), policy=dict( cuda=False, @@ -38,13 +41,6 @@ adv_norm=True, ), collect=dict( - # Users should add their own model path here. Model path should lead to a model. - # Absolute path is recommended. - # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. - model_path='model_path_placeholder', - # If you need the data collected by the collector to contain logit key which reflect the probability of - # the action, you can change the key to be True. - # In Guided cost Learning, we need to use logit to train the reward model, we change the key to be True. 
collector_logit=True, n_sample=2048, unroll_len=1, @@ -71,5 +67,18 @@ create_config = walker2d_gcl_create_config if __name__ == '__main__': - from ding.entry import serial_pipeline_guided_cost - serial_pipeline_guided_cost((main_config, create_config), seed=0) + from ding.entry import collect_demo_data, serial_pipeline_reward_model_offpolicy + from dizoo.mujoco.config.walker2d_sac_config import walker2d_sac_config, walker2d_sac_create_config + + expert_cfg = (walker2d_sac_config, walker2d_sac_create_config) + expert_data_path = main_config.reward_model.expert_data_path + state_dict_path = main_config.reward_model.expert_model_path + collect_count = main_config.reward_model.collect_count + collect_demo_data( + expert_cfg, + seed=0, + state_dict_path=state_dict_path, + expert_data_path=expert_data_path, + collect_count=collect_count + ) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/mujoco/config/walker2d_trex_onppo_config.py b/dizoo/mujoco/config/walker2d_trex_onppo_config.py index fea2611956..fb9e5a930a 100644 --- a/dizoo/mujoco/config/walker2d_trex_onppo_config.py +++ b/dizoo/mujoco/config/walker2d_trex_onppo_config.py @@ -88,4 +88,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_onpolicy([main_config, create_config],pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) diff --git a/dizoo/mujoco/config/walker2d_trex_sac_config.py b/dizoo/mujoco/config/walker2d_trex_sac_config.py index 6bad2bc6a9..c0588067e5 100644 --- a/dizoo/mujoco/config/walker2d_trex_sac_config.py +++ b/dizoo/mujoco/config/walker2d_trex_sac_config.py @@ -97,4 +97,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy([main_config, create_config],pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) From 919c01bd82d56e4455d08d0cefe3c29ecde6f553 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Mon, 22 May 2023 02:16:09 -0400 Subject: [PATCH 43/52] fix bug for test, remove wrong comment --- ding/entry/__init__.py | 1 - ding/entry/tests/test_application_entry_drex_collect_data.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ding/entry/__init__.py b/ding/entry/__init__.py index 0dfaa73db3..15935c5793 100644 --- a/ding/entry/__init__.py +++ b/ding/entry/__init__.py @@ -16,7 +16,6 @@ from .application_entry import eval, collect_demo_data, collect_episodic_demo_data, \ episode_to_transitions, episode_to_transitions_filter from .application_entry_trex_collect_data import trex_collecting_data, collect_episodic_demo_data_for_trex -from .serial_entry_guided_cost import serial_pipeline_guided_cost from .utils import random_collect from .application_entry_drex_collect_data import drex_collecting_data from .serial_entry_mbrl import serial_pipeline_dyna, serial_pipeline_dream diff --git 
a/ding/entry/tests/test_application_entry_drex_collect_data.py b/ding/entry/tests/test_application_entry_drex_collect_data.py index c7942fdfaf..de1ded03ee 100644 --- a/ding/entry/tests/test_application_entry_drex_collect_data.py +++ b/ding/entry/tests/test_application_entry_drex_collect_data.py @@ -70,5 +70,5 @@ def test_drex_collecting_data(): args.cfg[0].bc_iteration = 1000 # for unittest args.cfg[1].policy.type = 'bc' drex_collecting_data(args=args) - #os.popen('rm -rf {}'.format(expert_policy_state_dict_path)) - #os.popen('rm -rf {}'.format(args.cfg[0].reward_model.offline_data_path)) + os.popen('rm -rf {}'.format(expert_policy_state_dict_path)) + os.popen('rm -rf {}'.format(args.cfg[0].reward_model.offline_data_path)) From a1d0b3a12a83295072cd2887661416c82a17258d Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Mon, 5 Jun 2023 23:17:42 -0400 Subject: [PATCH 44/52] polish code for ngu, drex, base rm and entry --- ding/entry/serial_entry_reward_model_offpolicy.py | 4 +--- ding/entry/serial_entry_reward_model_onpolicy.py | 4 +--- ding/policy/ngu.py | 2 +- ding/reward_model/base_reward_model.py | 5 ++++- ding/reward_model/drex_reward_model.py | 6 ++---- 5 files changed, 9 insertions(+), 12 deletions(-) diff --git a/ding/entry/serial_entry_reward_model_offpolicy.py b/ding/entry/serial_entry_reward_model_offpolicy.py index c9cbc40259..fc49ddd711 100644 --- a/ding/entry/serial_entry_reward_model_offpolicy.py +++ b/ding/entry/serial_entry_reward_model_offpolicy.py @@ -92,7 +92,6 @@ def serial_pipeline_reward_model_offpolicy( # Accumulate plenty of data at the beginning of training. 
if cfg.policy.get('random_collect_size', 0) > 0: random_collect(cfg.policy, policy, collector, collector_env, commander, replay_buffer) - count = 0 best_reward = -np.inf while True: collect_kwargs = commander.step() @@ -116,7 +115,7 @@ def serial_pipeline_reward_model_offpolicy( if cooptrain_reward: reward_model.train() # clear buffer per fix iters to make sure replay buffer's data count isn't too few. - reward_model.clear_data(iter=count) + reward_model.clear_data(iter=learner.train_iter) # Learn policy from collected data for i in range(cfg.policy.learn.update_per_collect): # Learner will train ``update_per_collect`` times in one iteration. @@ -135,7 +134,6 @@ def serial_pipeline_reward_model_offpolicy( replay_buffer.update(learner.priority_info) if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter: break - count += 1 # Learner's after_run hook. learner.call_hook('after_run') diff --git a/ding/entry/serial_entry_reward_model_onpolicy.py b/ding/entry/serial_entry_reward_model_onpolicy.py index 975b32f461..a978bc4b5a 100644 --- a/ding/entry/serial_entry_reward_model_onpolicy.py +++ b/ding/entry/serial_entry_reward_model_onpolicy.py @@ -91,7 +91,6 @@ def serial_pipeline_reward_model_onpolicy( # Accumulate plenty of data at the beginning of training. if cfg.policy.get('random_collect_size', 0) > 0: random_collect(cfg.policy, policy, collector, collector_env, commander, replay_buffer) - count = 0 best_reward = -np.inf while True: collect_kwargs = commander.step() @@ -114,7 +113,7 @@ def serial_pipeline_reward_model_onpolicy( # update reward_model if cooptrain_reward: reward_model.train() - reward_model.clear_data(iter=count) + reward_model.clear_data(iter=learner.train_iter) # Learn policy from collected data for i in range(cfg.policy.learn.update_per_collect): # Learner will train ``update_per_collect`` times in one iteration. 
@@ -133,7 +132,6 @@ def serial_pipeline_reward_model_onpolicy( replay_buffer.update(learner.priority_info) if collector.envstep >= max_env_step or learner.train_iter >= max_train_iter: break - count += 1 # Learner's after_run hook. learner.call_hook('after_run') diff --git a/ding/policy/ngu.py b/ding/policy/ngu.py index efe2ee5b80..8c2ef6f48b 100644 --- a/ding/policy/ngu.py +++ b/ding/policy/ngu.py @@ -431,7 +431,7 @@ def _init_collect(self) -> None: # epsilon=0.4, alpha=9 self.eps = {i: 0.4 ** (1 + 8 * i / (self._cfg.collect.env_num - 1)) for i in range(self._cfg.collect.env_num)} - def _forward_collect(self, data: dict, eps: Optional[float]) -> dict: + def _forward_collect(self, data: dict, eps: float) -> dict: r""" Overview: Collect output according to eps_greedy plugin diff --git a/ding/reward_model/base_reward_model.py b/ding/reward_model/base_reward_model.py index 930975e8da..ac52304d0b 100644 --- a/ding/reward_model/base_reward_model.py +++ b/ding/reward_model/base_reward_model.py @@ -63,7 +63,10 @@ def collect_data(self, data) -> None: def clear_data(self, iter: int) -> None: """ Overview: - Clearing training data. \ + Clearing training data. 
+ Arguments: + - iter (:obj:`int`): Current training iteration + Returns / Effects: This can be a side effect function which clears the data attribute in ``self`` """ raise NotImplementedError() diff --git a/ding/reward_model/drex_reward_model.py b/ding/reward_model/drex_reward_model.py index a4727e6d12..ea11b53f02 100644 --- a/ding/reward_model/drex_reward_model.py +++ b/ding/reward_model/drex_reward_model.py @@ -67,6 +67,7 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> self.demo_data = [] self.load_expert_data() + self._logger.info("device: {}".format(device)) def load_expert_data(self) -> None: """ @@ -82,10 +83,7 @@ def load_expert_data(self) -> None: self.demo_data = pickle.load(f) def train(self): - # check if gpu available - device = self.device # torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - # Assume that we are on a CUDA machine, then this should print a CUDA device: - self._logger.info("device: {}".format(device)) + training_inputs, training_outputs = self.training_obs, self.training_labels cum_loss = 0.0 From e310b4c734c0b579bd6d8b5d684ea7dde2b30f5a Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Tue, 6 Jun 2023 01:10:01 -0400 Subject: [PATCH 45/52] polish code for all rm --- ding/reward_model/gail_irl_model.py | 9 +-- ding/reward_model/guided_cost_reward_model.py | 10 --- ding/reward_model/icm_reward_model.py | 1 - ding/reward_model/network.py | 8 +-- ding/reward_model/ngu_reward_model.py | 70 ++++++++++++++++++- ding/reward_model/pdeil_irl_model.py | 1 - ding/reward_model/reword_model_utils.py | 4 +- ding/reward_model/rnd_reward_model.py | 1 - ding/reward_model/trex_reward_model.py | 4 +- 9 files changed, 75 insertions(+), 33 deletions(-) diff --git a/ding/reward_model/gail_irl_model.py b/ding/reward_model/gail_irl_model.py index b638671ac1..52b6aadc55 100644 --- a/ding/reward_model/gail_irl_model.py +++ b/ding/reward_model/gail_irl_model.py @@ -1,7 +1,6 @@ from typing import List, Dict, Any import 
pickle import random -from collections.abc import Iterable from easydict import EasyDict import torch @@ -12,7 +11,6 @@ from .base_reward_model import BaseRewardModel from .reword_model_utils import concat_state_action_pairs from .network import GAILNetwork -import torch.nn.functional as F from functools import partial @@ -215,8 +213,11 @@ def collect_data(self, data: list) -> None: def clear_data(self, iter: int) -> None: """ Overview: - Clearing training data. \ - This is a side effect function which clears the data attribute in ``self`` + Clearing training data. + Arguments: + - iter (:obj:`int`): Current training iteration + Returns / Effects: + This can be a side effect function which clears the data attribute in ``self`` """ assert hasattr( self.cfg, 'clear_buffer_per_iters' diff --git a/ding/reward_model/guided_cost_reward_model.py b/ding/reward_model/guided_cost_reward_model.py index 15e95fd078..1c237f2651 100644 --- a/ding/reward_model/guided_cost_reward_model.py +++ b/ding/reward_model/guided_cost_reward_model.py @@ -1,11 +1,9 @@ from typing import List, Dict, Any from easydict import EasyDict -from ditk import logging import pickle import random import torch -import torch.nn as nn import torch.optim as optim import torch.nn.functional as F from torch.distributions import Independent, Normal @@ -38,10 +36,6 @@ class GuidedCostRewardModel(BaseRewardModel): 5 | ``batch_size`` int 64 | Training batch size | 6 | ``hidden_size`` int 128 | Linear model hidden size | 7 | ``action_shape`` int 1 | Action space shape | - 8 | ``log_every_n`` int 50 | add loss to log every n iteration | - | ``_train`` | | - 9 | ``store_model_`` int 100 | save model every n iteration | - | ``every_n_train`` | == ==================== ======== ============= ======================================== ================ """ @@ -63,10 +57,6 @@ class GuidedCostRewardModel(BaseRewardModel): # Bigger "update_per_collect" means bigger off-policy. 
# collect data -> update policy-> collect data -> ... update_per_collect=100, - # (int) Add loss to log every n iteration. - log_every_n_train=50, - # (int) Save model every n iteration. - store_model_every_n_train=100, ) def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> None: # noqa diff --git a/ding/reward_model/icm_reward_model.py b/ding/reward_model/icm_reward_model.py index f3a9dbfbba..98c37d70ab 100644 --- a/ding/reward_model/icm_reward_model.py +++ b/ding/reward_model/icm_reward_model.py @@ -3,7 +3,6 @@ import random import torch -import torch.nn as nn import torch.optim as optim from ding.utils import REWARD_MODEL_REGISTRY diff --git a/ding/reward_model/network.py b/ding/reward_model/network.py index 1182d70cf1..ca77eaeacc 100644 --- a/ding/reward_model/network.py +++ b/ding/reward_model/network.py @@ -1,19 +1,13 @@ from typing import Union, Tuple, List, Dict, Optional -from easydict import EasyDict import torch import torch.nn as nn import torch.nn.functional as F -from torch.distributions import Independent, Normal -from ding.utils import SequenceType, REWARD_MODEL_REGISTRY +from ding.utils import SequenceType from ding.model import FCEncoder, ConvEncoder -from ding.torch_utils.data_helper import to_tensor from ding.torch_utils import one_hot from ding.utils import RunningMeanStd -from ding.utils.data import default_collate -from .reword_model_utils import concat_state_action_pairs -from functools import partial class RepresentationNetwork(nn.Module): diff --git a/ding/reward_model/ngu_reward_model.py b/ding/reward_model/ngu_reward_model.py index c5a5e23284..cef21af27c 100644 --- a/ding/reward_model/ngu_reward_model.py +++ b/ding/reward_model/ngu_reward_model.py @@ -1,6 +1,5 @@ import copy import random -from typing import Any import numpy as np import torch @@ -9,7 +8,7 @@ from easydict import EasyDict from ding.utils import RunningMeanStd -from ding.utils import SequenceType, REWARD_MODEL_REGISTRY +from ding.utils import 
REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel from .network import RNDNetwork, InverseNetwork @@ -414,41 +413,106 @@ def fusion_reward( @REWARD_MODEL_REGISTRY.register('ngu-reward') class NGURewardModel(BaseRewardModel): - r""" + """ Overview: The unifying reward for ngu which combined rnd-ngu and episodic The corresponding paper is `never give up: learning directed exploration strategies`. + Interface: + ``__init__``, ``train``, ``estimate``, ``collect_data``, ``clear_data`` + Config: + == ==================== ======== ============= ==================================== ======================= + ID Symbol Type Default Value Description Other(Shape) + == ==================== ======== ============= ==================================== ======================= + 1 ``type`` str ngu-reward | Reward model register name, | + | refer to registry | + | ``REWARD_MODEL_REGISTRY`` | + 2 | ``intrinsic_`` str add | the intrinsic reward type | including add, new + | ``reward_type`` | | , or assign + 3 | ``policy_nstep`` int 1 | the nstep of policy | + 4 | ``nstep`` int 1 | the nstep of reward | + 5 | ``learning_rate`` float 0.001 | The step size of gradient descent | + 6 | ``obs_shape`` Tuple( 4 | the observation shape | + [int, + list]) + 7 | ``action_shape`` int 2 | the action space shape | + 8 | ``batch_size`` int 64 | Training batch size | + 9 | ``hidden`` list [64, 64, | Sequence of ``hidden_size`` | + | ``_size_list`` (int) 128] | of reward network. | + 10 | ``update_per_`` int 100 | Number of updates per collect | + | ``collect`` | | + 11 | ``only_use_`` float 1 | Whether to only use last | + | ``last_five_`` | five frames for ICM/RND. | + | ``frames_for_`` | | + | ``icm_rnd`` | | + 12 | ``last_nonzero_`` bool False | Whether to rescale | used in episode rm + ``reward_rescale`` | last nonzero reward. + 13 | ``last_nonzero_`` int 1 | The weight of last nonzero reward. 
| used in episode rm + ``reward_weight`` | | + 14 | ``clear_buffer`` int 1 | clear buffer per fixed iters | make sure replay + ``_per_iters`` | buffer's data count + | isn't too few. + | (code work in entry) + == ==================== ======== ============= ==================================== ======================= """ config = dict( + # (str) Reward model register name, refer to registry ``REWARD_MODEL_REGISTRY``. type='ngu-reward', + # (int) nstep for rl algorithms used in episode reward model policy_nstep=5, + # (int) number of envs for collecting data collect_env_num=8, rnd_reward_model=dict( + # (str) The intrinsic reward type, including add, new, or assign. intrinsic_reward_type='add', + # (float) The step size of gradient descent. learning_rate=5e-4, + # (Tuple[int, list]) The observation shape. obs_shape=4, + # (int) The action space shape. action_shape=2, + # (int) Training batch size. batch_size=128, # transitions + # (int) Number of updates per collect. update_per_collect=10, + # (bool) Whether to only use last five frames for ICM/RND. only_use_last_five_frames_for_icm_rnd=False, + # (int) Clear buffer per fixed iters, make sure replay buffer's data count isn't too few. clear_buffer_per_iters=10, + # (int) nstep for rl algorithms used in rnd reward model nstep=5, + # (list(int)) Sequence of ``hidden_size`` of reward network. + # If obs.shape == 1, use MLP layers. + # If obs.shape == 3, use conv layer and final dense layer. hidden_size_list=[128, 128, 64], + # (str) The reward model type, which must be rnd-ngu type='rnd-ngu', ), episodic_reward_model=dict( + # (bool) Whether to rescale last nonzero reward. last_nonzero_reward_rescale=False, + # (int) The weight of last nonzero reward. last_nonzero_reward_weight=1, + # (str) The intrinsic reward type, including add, new, or assign. intrinsic_reward_type='add', + # (float) The step size of gradient descent. learning_rate=5e-4, + # (Tuple[int, list]) The observation shape. 
obs_shape=4, + # (int) The action space shape. action_shape=2, + # (int) Training batch size. batch_size=128, # transitions + # (int) Number of updates per collect. update_per_collect=10, + # (bool) Whether to only use last five frames for ICM/RND. only_use_last_five_frames_for_icm_rnd=False, + # (int) Clear buffer per fixed iters, make sure replay buffer's data count isn't too few. clear_buffer_per_iters=10, + # (int) nstep for rl algorithms used in episode reward model nstep=5, + # (list(int)) Sequence of ``hidden_size`` of reward network. hidden_size_list=[128, 128, 64], + # (str) The reward model type, which must be episodic type='episodic', ), ) diff --git a/ding/reward_model/pdeil_irl_model.py b/ding/reward_model/pdeil_irl_model.py index 4cea543933..9cf37d84b8 100644 --- a/ding/reward_model/pdeil_irl_model.py +++ b/ding/reward_model/pdeil_irl_model.py @@ -3,7 +3,6 @@ import numpy as np import torch import pickle -import scipy.stats as stats try: from sklearn.svm import SVC except ImportError: diff --git a/ding/reward_model/reword_model_utils.py b/ding/reward_model/reword_model_utils.py index c94dc69062..91cdd85cfa 100644 --- a/ding/reward_model/reword_model_utils.py +++ b/ding/reward_model/reword_model_utils.py @@ -1,10 +1,8 @@ -from typing import Union, Optional, List, Any, Tuple +from typing import Optional, List, Any from collections.abc import Iterable from easydict import EasyDict import torch -import torch.optim as optim -import torch.nn.functional as F from ding.torch_utils import one_hot from ding.utils import RunningMeanStd diff --git a/ding/reward_model/rnd_reward_model.py b/ding/reward_model/rnd_reward_model.py index d7ae37e787..528c27838a 100644 --- a/ding/reward_model/rnd_reward_model.py +++ b/ding/reward_model/rnd_reward_model.py @@ -10,7 +10,6 @@ from .reword_model_utils import combine_intrinsic_exterinsic_reward, collect_states, obs_norm from .network import RNDNetwork from ding.utils import RunningMeanStd -from ding.torch_utils.data_helper 
import to_tensor import numpy as np diff --git a/ding/reward_model/trex_reward_model.py b/ding/reward_model/trex_reward_model.py index 034b9bbd32..b2da1d0c25 100644 --- a/ding/reward_model/trex_reward_model.py +++ b/ding/reward_model/trex_reward_model.py @@ -1,5 +1,5 @@ from copy import deepcopy -from typing import Tuple, Optional, List, Dict +from typing import Tuple, List, Dict from easydict import EasyDict from ditk import logging import pickle @@ -11,8 +11,6 @@ import torch.optim as optim from ding.utils import REWARD_MODEL_REGISTRY -from ding.utils import SequenceType -from ding.model.common import FCEncoder from ding.utils import build_logger from ding.utils.data import default_collate From 92dc227958dcb803bd495907df36a78534c09f4a Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Tue, 6 Jun 2023 01:15:58 -0400 Subject: [PATCH 46/52] fix style for ngu --- ding/reward_model/ngu_reward_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/reward_model/ngu_reward_model.py b/ding/reward_model/ngu_reward_model.py index cef21af27c..0092066904 100644 --- a/ding/reward_model/ngu_reward_model.py +++ b/ding/reward_model/ngu_reward_model.py @@ -441,7 +441,7 @@ class NGURewardModel(BaseRewardModel): 10 | ``update_per_`` int 100 | Number of updates per collect | | ``collect`` | | 11 | ``only_use_`` float 1 | Whether to only use last | - | ``last_five_`` | five frames for ICM/RND. | + | ``last_five_`` | five frames for ICM/RND. | | ``frames_for_`` | | | ``icm_rnd`` | | 12 | ``last_nonzero_`` bool False | Whether to rescale | used in episode rm @@ -452,7 +452,7 @@ class NGURewardModel(BaseRewardModel): ``_per_iters`` | buffer's data count | isn't too few. 
| (code work in entry) - == ==================== ======== ============= ==================================== ======================= + == ==================== ======== ============= ==================================== ======================= """ config = dict( # (str) Reward model register name, refer to registry ``REWARD_MODEL_REGISTRY``. From a4f364dee1e54a78bf849693f80a7efb065f33b9 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Tue, 6 Jun 2023 02:15:41 -0400 Subject: [PATCH 47/52] polish comment for config files --- dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py | 4 ++++ .../atari/config/serial/qbert/qbert_trex_offppo_config.py | 4 ++++ dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py | 7 +++++++ .../classic_control/cartpole/config/cartpole_gcl_config.py | 7 +++++++ .../cartpole/config/cartpole_trex_dqn_config.py | 3 +++ .../cartpole/config/cartpole_trex_offppo_config.py | 3 +++ .../cartpole/config/cartpole_trex_onppo_config.py | 3 +++ dizoo/mujoco/config/halfcheetah_gcl_sac_config.py | 7 +++++++ dizoo/mujoco/config/hopper_gcl_config.py | 7 +++++++ dizoo/mujoco/config/walker2d_gcl_config.py | 7 +++++++ 10 files changed, 52 insertions(+) diff --git a/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py b/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py index de09013946..ec135e4399 100644 --- a/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py +++ b/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py @@ -21,6 +21,10 @@ checkpoint_step=100, learning_rate=1e-5, update_per_collect=1, + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. + # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. 
expert_model_path='qbert_dqn_seed0', hidden_size_list=[512, 64, 1], obs_shape=[4, 84, 84], diff --git a/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py b/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py index 7a82f02afc..0076f56f26 100644 --- a/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py +++ b/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py @@ -21,6 +21,10 @@ checkpoint_step=100, learning_rate=1e-5, update_per_collect=1, + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. + # However, here in ``expert_model_path``, it is ``exp_name`` of the expert config. expert_model_path='qbert_ppo_seed0', hidden_size_list=[512, 64, 1], obs_shape=[4, 84, 84], diff --git a/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py b/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py index 85932f9928..2def4718fe 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_gcl_config.py @@ -15,7 +15,14 @@ batch_size=32, continuous=False, update_per_collect=20, + # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. + # Absolute path is recommended. + # In DI-engine, it is usually located in ``exp_name`` directory + # e.g. 'exp_name/expert_data.pkl' expert_data_path='lunarlander_ppo_offpolicy_seed0/expert_data.pkl', + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. 
expert_model_path='lunarlander_ppo_offpolicy_seed0/ckpt/ckpt_best.pth.tar', collect_count=100000, ), diff --git a/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py b/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py index a6ea193ef1..d71eaf3adb 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_gcl_config.py @@ -13,7 +13,14 @@ input_size=5, batch_size=32, continuous=False, + # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. + # Absolute path is recommended. + # In DI-engine, it is usually located in ``exp_name`` directory + # e.g. 'exp_name/expert_data.pkl' expert_data_path='cartpole_ppo_offpolicy_seed0/expert_data.pkl', + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. expert_model_path='cartpole_ppo_offpolicy_seed0/ckpt/ckpt_best.pth.tar', update_per_collect=10, collect_count=1000, diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py index 9e6f20eaf3..34363359a9 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py @@ -20,6 +20,9 @@ update_per_collect=1, num_trajs=6, num_snippets=6000, + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name``. 
expert_model_path='cartpole_dqn_seed0', # expert model experiment directory path hidden_size_list=[512, 64, 1], obs_shape=4, diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py index 8cde57b708..71d0ae73a8 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py @@ -20,6 +20,9 @@ update_per_collect=1, num_trajs=0, num_snippets=6000, + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name``. expert_model_path='cartpole_ppo_seed0', # expert model experiment directory path hidden_size_list=[512, 64, 1], obs_shape=4, diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py index 3faf422ead..3f1ecfc44d 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py @@ -18,6 +18,9 @@ checkpoint_step=100, learning_rate=1e-5, update_per_collect=1, + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name``. expert_model_path='cartpole_ppo_seed0', # expert model experiment directory path hidden_size_list=[512, 64, 1], obs_shape=4, diff --git a/dizoo/mujoco/config/halfcheetah_gcl_sac_config.py b/dizoo/mujoco/config/halfcheetah_gcl_sac_config.py index 6b06a1094b..05781959c0 100644 --- a/dizoo/mujoco/config/halfcheetah_gcl_sac_config.py +++ b/dizoo/mujoco/config/halfcheetah_gcl_sac_config.py @@ -17,7 +17,14 @@ batch_size=32, action_shape=6, continuous=True, + # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. + # Absolute path is recommended. 
+ # In DI-engine, it is usually located in ``exp_name`` directory + # e.g. 'exp_name/expert_data.pkl' expert_data_path='halfcheetah_sac_seed0/expert_data.pkl', + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. expert_model_path='halfcheetah_sac_seed0/ckpt/ckpt_best.pth.tar', update_per_collect=20, collect_count=300000, diff --git a/dizoo/mujoco/config/hopper_gcl_config.py b/dizoo/mujoco/config/hopper_gcl_config.py index d098288ca0..d7299367c1 100644 --- a/dizoo/mujoco/config/hopper_gcl_config.py +++ b/dizoo/mujoco/config/hopper_gcl_config.py @@ -17,7 +17,14 @@ batch_size=32, action_shape=3, continuous=True, + # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. + # Absolute path is recommended. + # In DI-engine, it is usually located in ``exp_name`` directory + # e.g. 'exp_name/expert_data.pkl' expert_data_path='hopper_sac_seed0/expert_data.pkl', + # Users should add their own model path here. Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. expert_model_path='hopper_sac_seed0/ckpt/ckpt_best.pth.tar', update_per_collect=20, collect_count=100000, diff --git a/dizoo/mujoco/config/walker2d_gcl_config.py b/dizoo/mujoco/config/walker2d_gcl_config.py index 08c0fa5bef..b741359067 100644 --- a/dizoo/mujoco/config/walker2d_gcl_config.py +++ b/dizoo/mujoco/config/walker2d_gcl_config.py @@ -17,7 +17,14 @@ batch_size=32, action_shape=6, continuous=True, + # Users should add their own data path here. Data path should lead to a file to store data or load the stored data. + # Absolute path is recommended. + # In DI-engine, it is usually located in ``exp_name`` directory + # e.g. 'exp_name/expert_data.pkl' expert_data_path='walker2d_sac_seed0/expert_data.pkl', + # Users should add their own model path here. 
Model path should lead to a model. + # Absolute path is recommended. + # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``. expert_model_path='walker2d_sac_seed0/ckpt/ckpt_best.pth.tar', update_per_collect=20, collect_count=100000, From 1f06dec7bd81cbfe8e4b051856d93ceb6d325d5e Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Fri, 9 Jun 2023 02:24:18 -0400 Subject: [PATCH 48/52] add gcl unit test --- .../tests/test_serial_entry_reward_model.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/ding/entry/tests/test_serial_entry_reward_model.py b/ding/entry/tests/test_serial_entry_reward_model.py index 4dc3cd91fe..20922c6991 100644 --- a/ding/entry/tests/test_serial_entry_reward_model.py +++ b/ding/entry/tests/test_serial_entry_reward_model.py @@ -1,6 +1,5 @@ import pytest import os -from ditk import logging from easydict import EasyDict from copy import deepcopy @@ -23,17 +22,20 @@ 'type': 'pdeil', "alpha": 0.5, "discrete_action": False - }, { + }, + { 'type': 'gail', 'input_size': 5, 'hidden_size_list': [64], 'batch_size': 64, - }, { + }, + { 'type': 'pwil', 's_size': 4, 'a_size': 2, 'sample_size': 500, - }, { + }, + { 'type': 'red', 'sample_size': 5000, 'obs_shape': 4, @@ -41,7 +43,16 @@ 'hidden_size_list': [64, 1], 'update_per_collect': 200, 'batch_size': 128, - } + }, + { + 'type': 'guided_cost', + 'learning_rate': 0.001, + 'input_size': 5, + 'batch_size': 32, + 'continuous': False, + 'update_per_collect': 10, + 'collect_count': 1000, + }, ] From a547b3b0fe332b3afa20baee37e75b9723f4e242 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Mon, 19 Jun 2023 02:43:37 -0400 Subject: [PATCH 49/52] polish RM --- .../serial_entry_reward_model_offpolicy.py | 17 ++- .../serial_entry_reward_model_onpolicy.py | 12 +- .../tests/test_serial_entry_reward_model.py | 18 ++- ding/reward_model/__init__.py | 2 +- ding/reward_model/gail_irl_model.py | 2 +- ding/reward_model/icm_reward_model.py | 56 ++++--- ding/reward_model/network.py | 55 ++++--- 
ding/reward_model/ngu_reward_model.py | 2 +- ding/reward_model/pwil_irl_model.py | 2 +- ding/reward_model/red_irl_model.py | 2 +- ...d_model_utils.py => reward_model_utils.py} | 0 ding/reward_model/rnd_reward_model.py | 4 +- .../tests/test_reward_model_utils.py | 2 +- ding/reward_model/trex_reward_model.py | 2 +- .../serial/montezuma/montezuma_ngu_config.py | 131 ----------------- .../serial/pitfall/pitfall_ngu_config.py | 138 ------------------ .../serial/pong/pong_gail_dqn_config.py | 2 +- .../serial/pong/pong_trex_offppo_config.py | 2 +- .../serial/pong/pong_trex_sql_config.py | 2 +- .../serial/qbert/qbert_trex_dqn_config.py | 2 +- .../serial/qbert/qbert_trex_offppo_config.py | 2 +- .../spaceinvaders_trex_dqn_config.py | 2 +- .../spaceinvaders_trex_offppo_config.py | 2 +- .../config/bipedalwalker_gail_sac_config.py | 2 +- .../config/lunarlander_drex_dqn_config.py | 2 +- .../config/lunarlander_gail_dqn_config.py | 2 +- .../config/lunarlander_ngu_config.py | 8 +- .../config/lunarlander_trex_dqn_config.py | 2 +- .../config/lunarlander_trex_offppo_config.py | 2 +- .../config/cartpole_drex_dqn_config.py | 2 +- .../config/cartpole_trex_dqn_config.py | 2 +- .../config/cartpole_trex_offppo_config.py | 2 +- .../config/cartpole_trex_onppo_config.py | 2 +- dizoo/mujoco/config/ant_gail_sac_config.py | 2 +- dizoo/mujoco/config/ant_trex_onppo_config.py | 15 +- dizoo/mujoco/config/ant_trex_sac_config.py | 15 +- .../config/halfcheetah_gail_sac_config.py | 2 +- .../config/halfcheetah_trex_onppo_config.py | 2 +- .../config/halfcheetah_trex_sac_config.py | 2 +- .../mujoco/config/hopper_trex_onppo_config.py | 2 +- dizoo/mujoco/config/hopper_trex_sac_config.py | 2 +- .../config/walker2d_gail_ddpg_config.py | 20 ++- .../mujoco/config/walker2d_gail_sac_config.py | 2 +- .../config/walker2d_trex_onppo_config.py | 2 +- .../mujoco/config/walker2d_trex_sac_config.py | 2 +- 45 files changed, 176 insertions(+), 377 deletions(-) rename ding/reward_model/{reword_model_utils.py => 
reward_model_utils.py} (100%) delete mode 100644 dizoo/atari/config/serial/montezuma/montezuma_ngu_config.py delete mode 100644 dizoo/atari/config/serial/pitfall/pitfall_ngu_config.py diff --git a/ding/entry/serial_entry_reward_model_offpolicy.py b/ding/entry/serial_entry_reward_model_offpolicy.py index fc49ddd711..b4a4b84bc8 100644 --- a/ding/entry/serial_entry_reward_model_offpolicy.py +++ b/ding/entry/serial_entry_reward_model_offpolicy.py @@ -24,8 +24,8 @@ def serial_pipeline_reward_model_offpolicy( model: Optional[torch.nn.Module] = None, max_train_iter: Optional[int] = int(1e10), max_env_step: Optional[int] = int(1e10), - cooptrain_reward: Optional[bool] = True, - pretrain_reward: Optional[bool] = False, + cooptrain_reward_model: Optional[bool] = True, + pretrain_reward_model: Optional[bool] = False, ) -> 'Policy': # noqa """ Overview: @@ -40,6 +40,8 @@ def serial_pipeline_reward_model_offpolicy( - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module. - max_train_iter (:obj:`Optional[int]`): Maximum policy update iterations in training. - max_env_step (:obj:`Optional[int]`): Maximum collected environment interaction steps. + - cooptrain_reward_model (:obj:`Optional[bool]`): Whether train reward model during policy training. + - pretrain_reward_model (:obj:`Optional[bool]`): Whether train reward model before policy training. Returns: - policy (:obj:`Policy`): Converged policy. 
""" @@ -80,7 +82,7 @@ def serial_pipeline_reward_model_offpolicy( cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode ) reward_model = create_reward_model(cfg.reward_model, policy.collect_mode.get_attribute('device'), tb_logger) - if pretrain_reward: + if pretrain_reward_model: reward_model.train() # ========== @@ -108,13 +110,14 @@ def serial_pipeline_reward_model_offpolicy( while new_data_count < target_new_data_count: new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs) new_data_count += len(new_data) - # collect data for reward_model training - reward_model.collect_data(new_data) replay_buffer.push(new_data, cur_collector_envstep=collector.envstep) + if cooptrain_reward_model: + # collect data for reward_model training + reward_model.collect_data(new_data) # update reward_model, when you want to train reward_model inloop - if cooptrain_reward: + if cooptrain_reward_model: reward_model.train() - # clear buffer per fix iters to make sure replay buffer's data count isn't too few. + # clear buffer per fixed iters to make sure the data for RM training is not too offpolicy. 
reward_model.clear_data(iter=learner.train_iter) # Learn policy from collected data for i in range(cfg.policy.learn.update_per_collect): diff --git a/ding/entry/serial_entry_reward_model_onpolicy.py b/ding/entry/serial_entry_reward_model_onpolicy.py index a978bc4b5a..f1a3c388ab 100644 --- a/ding/entry/serial_entry_reward_model_onpolicy.py +++ b/ding/entry/serial_entry_reward_model_onpolicy.py @@ -24,8 +24,8 @@ def serial_pipeline_reward_model_onpolicy( model: Optional[torch.nn.Module] = None, max_train_iter: Optional[int] = int(1e10), max_env_step: Optional[int] = int(1e10), - cooptrain_reward: Optional[bool] = True, - pretrain_reward: Optional[bool] = False, + cooptrain_reward_model: Optional[bool] = True, + pretrain_reward_model: Optional[bool] = False, ) -> 'Policy': # noqa """ Overview: @@ -40,6 +40,8 @@ def serial_pipeline_reward_model_onpolicy( - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module. - max_train_iter (:obj:`Optional[int]`): Maximum policy update iterations in training. - max_env_step (:obj:`Optional[int]`): Maximum collected environment interaction steps. + - cooptrain_reward_model (:obj:`Optional[bool]`): Whether train reward model during policy training. + - pretrain_reward_model (:obj:`Optional[bool]`): Whether train reward model before policy training. Returns: - policy (:obj:`Policy`): Converged policy. 
""" @@ -80,7 +82,7 @@ def serial_pipeline_reward_model_onpolicy( cfg.policy.other.commander, learner, collector, evaluator, replay_buffer, policy.command_mode ) reward_model = create_reward_model(cfg.reward_model, policy.collect_mode.get_attribute('device'), tb_logger) - if pretrain_reward: + if pretrain_reward_model: reward_model.train() # ========== # Main loop @@ -108,10 +110,10 @@ def serial_pipeline_reward_model_onpolicy( new_data = collector.collect(train_iter=learner.train_iter, policy_kwargs=collect_kwargs) new_data_count += len(new_data) # collect data for reward_model training - if cooptrain_reward: + if cooptrain_reward_model: reward_model.collect_data(new_data) # update reward_model - if cooptrain_reward: + if cooptrain_reward_model: reward_model.train() reward_model.clear_data(iter=learner.train_iter) # Learn policy from collected data diff --git a/ding/entry/tests/test_serial_entry_reward_model.py b/ding/entry/tests/test_serial_entry_reward_model.py index 20922c6991..b971cfe137 100644 --- a/ding/entry/tests/test_serial_entry_reward_model.py +++ b/ding/entry/tests/test_serial_entry_reward_model.py @@ -84,14 +84,14 @@ def test_irl(reward_model_config): reward_model_config['expert_data_path'] = expert_data_path cp_cartpole_dqn_config.reward_model = reward_model_config cp_cartpole_dqn_config.policy.collect.n_sample = 128 - cooptrain_reward = True - pretrain_reward = False + cooptrain_reward_model = True + pretrain_reward_model = False serial_pipeline_reward_model_offpolicy( (cp_cartpole_dqn_config, cp_cartpole_dqn_create_config), seed=0, max_train_iter=2, - pretrain_reward=pretrain_reward, - cooptrain_reward=cooptrain_reward + pretrain_reward_model=pretrain_reward_model, + cooptrain_reward_model=cooptrain_reward_model ) os.popen("rm -rf ckpt_* log expert_data.pkl") @@ -103,6 +103,8 @@ def test_rnd(): serial_pipeline_reward_model_onpolicy(config, seed=0, max_train_iter=2) except Exception: assert False, "pipeline fail" + finally: + os.popen("rm -rf 
cartpole_ppo_rnd*") @pytest.mark.unittest @@ -112,6 +114,8 @@ def test_icm(): serial_pipeline_reward_model_offpolicy(config, seed=0, max_train_iter=2) except Exception: assert False, "pipeline fail" + finally: + os.popen("rm -rf cartpole_ppo_icm*") @pytest.mark.unittest @@ -121,6 +125,8 @@ def test_ngu(): serial_pipeline_reward_model_offpolicy(config, seed=0, max_train_iter=2) except Exception: assert False, "pipeline fail" + finally: + os.popen("rm -rf cartpole_ngu*") @pytest.mark.unittest @@ -143,7 +149,7 @@ def test_trex(): trex_collecting_data(args=args) try: serial_pipeline_reward_model_offpolicy( - config, seed=0, max_train_iter=1, pretrain_reward=True, cooptrain_reward=False + config, seed=0, max_train_iter=1, pretrain_reward_model=True, cooptrain_reward_model=False ) except Exception: assert False, "pipeline fail" @@ -178,7 +184,7 @@ def test_drex(): drex_collecting_data(args=args) try: serial_pipeline_reward_model_offpolicy( - config, seed=0, max_train_iter=1, pretrain_reward=True, cooptrain_reward=False + config, seed=0, max_train_iter=1, pretrain_reward_model=True, cooptrain_reward_model=False ) except Exception: assert False, "pipeline fail" diff --git a/ding/reward_model/__init__.py b/ding/reward_model/__init__.py index 414728289b..47b5152b59 100644 --- a/ding/reward_model/__init__.py +++ b/ding/reward_model/__init__.py @@ -14,4 +14,4 @@ from .ngu_reward_model import RndNGURewardModel, EpisodicNGURewardModel, NGURewardModel from .icm_reward_model import ICMRewardModel from .network import RepresentationNetwork, RNDNetwork, REDNetwork, GAILNetwork, ICMNetwork, GCLNetwork, TREXNetwork -from .reword_model_utils import concat_state_action_pairs, combine_intrinsic_exterinsic_reward, obs_norm, collect_states +from .reward_model_utils import concat_state_action_pairs, combine_intrinsic_exterinsic_reward, obs_norm, collect_states diff --git a/ding/reward_model/gail_irl_model.py b/ding/reward_model/gail_irl_model.py index 52b6aadc55..b82e2d17e6 100644 --- 
a/ding/reward_model/gail_irl_model.py +++ b/ding/reward_model/gail_irl_model.py @@ -9,7 +9,7 @@ from ding.utils import REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel -from .reword_model_utils import concat_state_action_pairs +from .reward_model_utils import concat_state_action_pairs from .network import GAILNetwork from functools import partial diff --git a/ding/reward_model/icm_reward_model.py b/ding/reward_model/icm_reward_model.py index 98c37d70ab..2cdc864362 100644 --- a/ding/reward_model/icm_reward_model.py +++ b/ding/reward_model/icm_reward_model.py @@ -8,7 +8,7 @@ from ding.utils import REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel from .network import ICMNetwork -from .reword_model_utils import combine_intrinsic_exterinsic_reward +from .reward_model_utils import combine_intrinsic_exterinsic_reward def collect_states(iterator: list) -> Tuple[list, list, list]: @@ -48,19 +48,22 @@ class ICMRewardModel(BaseRewardModel): list]) 5 | ``action_shape`` int 7 | the action space shape | 6 | ``batch_size`` int 64 | Training batch size | - 7 | ``hidden`` list [64, 64, | the MLP layer shape | + 7 | ``residual_num`` int 4 | the residual number of residual net | + 8 | ``hidden`` list [64, 64, | the MLP layer shape | | ``_size_list`` (int) 128] | | - 8 | ``update_per_`` int 100 | Number of updates per collect | + 9 | ``inverse_`` int 512 | the inverse model hidden size | + | ``hidden_size`` + 10 | ``update_per_`` int 100 | Number of updates per collect | | ``collect`` | | - 9 | ``reverse_scale`` float 1 | the importance weight of the | - | forward and reverse loss | - 10 | ``intrinsic_`` float 0.003 | the weight of intrinsic reward | r = w*r_i + r_e + 11 | ``reverse_loss`` float 1 | the importance weight of the | + ``_weight`` | forward and reverse loss | + 12 | ``intrinsic_`` float 0.003 | the weight of intrinsic reward | r = w*r_i + r_e ``reward_weight`` - 11 | ``extrinsic_`` bool True | Whether to normlize + 13 | 
``extrinsic_`` bool True | Whether to normlize ``reward_norm`` | extrinsic reward - 12 | ``extrinsic_`` int 1 | the upper bound of the reward + 14 | ``extrinsic_`` int 1 | the upper bound of the reward ``reward_norm_max`` | normalization - 13 | ``clear_buffer`` int 1 | clear buffer per fixed iters | make sure replay + 15 | ``clear_buffer`` int 1 | clear buffer per fixed iters | make sure replay ``_per_iters`` | buffer's data count | isn't too few. | (code work in entry) @@ -81,12 +84,16 @@ class ICMRewardModel(BaseRewardModel): batch_size=64, # (list) The MLP layer shape. hidden_size_list=[64, 64, 128], + # (int) the residual number. + residual_num=4, + # (int) The hidden layer shape of inverse network + inverse_hidden_size=512, # (int) How many updates(iterations) to train after collector's one collection. # Bigger "update_per_collect" means bigger off-policy. # collect data -> update policy-> collect data -> ... update_per_collect=100, # (float) The importance weight of the forward and reverse loss. - reverse_scale=1, + reverse_loss_weight=1, # (float) The weight of intrinsic reward. # r = intrinsic_reward_weight * r_i + r_e. 
intrinsic_reward_weight=0.003, # 1/300 @@ -105,7 +112,10 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> assert device == "cpu" or device.startswith("cuda") self.device = device self.tb_logger = tb_logger - self.reward_model = ICMNetwork(config.obs_shape, config.hidden_size_list, config.action_shape) + self.reward_model = ICMNetwork( + config.obs_shape, config.hidden_size_list, config.residual_num, config.inverse_hidden_size, + config.action_shape + ) self.reward_model.to(self.device) self.intrinsic_reward_type = config.intrinsic_reward_type assert self.intrinsic_reward_type in ['add', 'new', 'assign'] @@ -114,7 +124,7 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> self.train_next_states = [] self.train_actions = [] self.opt = optim.Adam(self.reward_model.parameters(), config.learning_rate) - self.reverse_scale = config.reverse_scale + self.reverse_loss_weight = config.reverse_loss_weight self.estimate_cnt_icm = 0 self.train_cnt_icm = 0 @@ -128,22 +138,25 @@ def _train(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, float]: data_actions: list = [self.train_actions[i] for i in train_data_index] data_actions: torch.Tensor = torch.cat(data_actions).to(self.device) - inverse_loss, forward_loss, accuracy = self.reward_model.learn(data_states, data_next_states, data_actions) - loss = self.reverse_scale * inverse_loss + forward_loss + # action_accuracy: the accuracy of predicting the action. 
+ inverse_loss, forward_loss, action_accuracy = self.reward_model.learn( + data_states, data_next_states, data_actions + ) + loss = self.reverse_loss_weight * inverse_loss + forward_loss self.opt.zero_grad() loss.backward() self.opt.step() - return loss, inverse_loss, forward_loss, accuracy + return loss, inverse_loss, forward_loss, action_accuracy def train(self) -> None: for _ in range(self.cfg.update_per_collect): - loss, inverse_loss, forward_loss, accuracy = self._train() + loss, inverse_loss, forward_loss, action_accuracy = self._train() self.tb_logger.add_scalar('icm_reward/total_loss', loss, self.train_cnt_icm) self.tb_logger.add_scalar('icm_reward/forward_loss', forward_loss, self.train_cnt_icm) self.tb_logger.add_scalar('icm_reward/inverse_loss', inverse_loss, self.train_cnt_icm) - self.tb_logger.add_scalar('icm_reward/action_accuracy', accuracy, self.train_cnt_icm) + self.tb_logger.add_scalar('icm_reward/action_accuracy', action_accuracy, self.train_cnt_icm) self.train_cnt_icm += 1 def estimate(self, data: list) -> List[Dict]: @@ -161,9 +174,12 @@ def estimate(self, data: list) -> List[Dict]: self.tb_logger.add_scalar('icm_reward/raw_icm_reward_mean', raw_icm_reward.mean(), self.estimate_cnt_icm) self.tb_logger.add_scalar('icm_reward/raw_icm_reward_min', raw_icm_reward.min(), self.estimate_cnt_icm) self.tb_logger.add_scalar('icm_reward/raw_icm_reward_std', raw_icm_reward.std(), self.estimate_cnt_icm) - icm_reward = (raw_icm_reward - raw_icm_reward.min()) / (raw_icm_reward.max() - raw_icm_reward.min() + 1e-8) - icm_reward = icm_reward.to(self.device) - train_data_augmented = combine_intrinsic_exterinsic_reward(train_data_augmented, icm_reward, self.cfg) + normalized_icm_reward = (raw_icm_reward - + raw_icm_reward.min()) / (raw_icm_reward.max() - raw_icm_reward.min() + 1e-8) + normalized_icm_reward = normalized_icm_reward.to(self.device) + train_data_augmented = combine_intrinsic_exterinsic_reward( + train_data_augmented, normalized_icm_reward, self.cfg + 
) return train_data_augmented diff --git a/ding/reward_model/network.py b/ding/reward_model/network.py index ca77eaeacc..567f395002 100644 --- a/ding/reward_model/network.py +++ b/ding/reward_model/network.py @@ -155,23 +155,29 @@ class ICMNetwork(nn.Module): 3) Predicting the next embedded obs, given the embeded former observation and action """ - def __init__(self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType, action_shape: int) -> None: + def __init__( + self, obs_shape: Union[int, SequenceType], hidden_size_list: SequenceType, residual_number: int, + inverse_hidden_size: int, action_shape: int + ) -> None: super(ICMNetwork, self).__init__() self.action_shape = action_shape feature_output = hidden_size_list[-1] self.feature = RepresentationNetwork(obs_shape, hidden_size_list) - self.inverse_net = nn.Sequential(nn.Linear(feature_output * 2, 512), nn.ReLU(), nn.Linear(512, action_shape)) + self.inverse_net = nn.Sequential( + nn.Linear(feature_output * 2, inverse_hidden_size), nn.ReLU(), nn.Linear(inverse_hidden_size, action_shape) + ) + self.residual_number = residual_number self.residual = nn.ModuleList( [ nn.Sequential( nn.Linear(action_shape + 512, 512), nn.LeakyReLU(), nn.Linear(512, 512), - ) for _ in range(8) + ) for _ in range(self.residual_number * 2) ] ) - self.forward_net_1 = RepresentationNetwork(action_shape + feature_output, [512], nn.LeakyReLU()) - self.forward_net_2 = nn.Linear(action_shape + 512, feature_output) + self.forward_net_backbone = RepresentationNetwork(action_shape + feature_output, [512], nn.LeakyReLU()) + self.forward_net_head = nn.Linear(action_shape + 512, feature_output) def _forward(self, state: torch.Tensor, next_state: torch.Tensor, action_long: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -204,25 +210,26 @@ def _forward(self, state: torch.Tensor, next_state: torch.Tensor, - pred_action_logit (:obj:`torch.Tensor`): :math:`(B, A)`, where B is the batch size and A is the 
''action_shape'' """ + # use feature network to encode state and next state + # feature network will use to identify whether the state has been seen action = one_hot(action_long, num=self.action_shape) encode_state = self.feature(state) encode_next_state = self.feature(next_state) # get pred action logit concat_state = torch.cat((encode_state, encode_next_state), 1) pred_action_logit = self.inverse_net(concat_state) - # --------------------- # get pred next state pred_next_state_feature_orig = torch.cat((encode_state, action), 1) - pred_next_state_feature_orig = self.forward_net_1(pred_next_state_feature_orig) + pred_next_state_feature_orig = self.forward_net_backbone(pred_next_state_feature_orig) # residual - for i in range(4): + for i in range(self.residual_number): pred_next_state_feature = self.residual[i * 2](torch.cat((pred_next_state_feature_orig, action), 1)) pred_next_state_feature_orig = self.residual[i * 2 + 1]( torch.cat((pred_next_state_feature, action), 1) ) + pred_next_state_feature_orig - pred_next_state_feature = self.forward_net_2(torch.cat((pred_next_state_feature_orig, action), 1)) + pred_next_state_feature = self.forward_net_head(torch.cat((pred_next_state_feature_orig, action), 1)) real_next_state_feature = encode_next_state return real_next_state_feature, pred_next_state_feature, pred_action_logit @@ -323,6 +330,10 @@ def get_outputs_abs_reward(self, traj_i: torch.Tensor, traj_j: torch.Tensor) -> class InverseNetwork(nn.Module): + """ + Overview: + Network used in Episodic reward model for NGU. 
+ """ def __init__( self, obs_shape: Union[int, SequenceType], action_shape: int, hidden_size_list: SequenceType, device: str @@ -340,11 +351,11 @@ def _compute_intrinsic_reward( self, episodic_memory: List, current_controllable_state: torch.Tensor, - k=10, - kernel_cluster_distance=0.008, - kernel_epsilon=0.0001, - c=0.001, - siminarity_max=8, + k: int = 10, + kernel_cluster_distance: float = 0.008, + kernel_epsilon: float = 0.0001, + c: float = 0.001, + siminarity_max: int = 8, ) -> torch.Tensor: # this function is modified from https://github.com/Coac/never-give-up/blob/main/embedding_model.py state_dist = torch.cdist(current_controllable_state.unsqueeze(0), episodic_memory, p=2).squeeze(0).sort()[0][:k] @@ -385,9 +396,6 @@ def forward(self, obs: list, is_null: list) -> Tuple[torch.Tensor, torch.Tensor] for i in range(batch_size): for j in range(seq_length): if j < 10: - # if self._running_mean_std_episodic_reward.mean is not None: - # episodic_reward[i].append(torch.tensor(self._running_mean_std_episodic_reward.mean).to(self.device)) - # else: episodic_reward[i].append(torch.tensor(0.).to(self.device)) elif j: episodic_memory = cur_obs_embedding[i][:j] @@ -418,7 +426,18 @@ def forward(self, obs: list, is_null: list) -> Tuple[torch.Tensor, torch.Tensor] return episodic_reward, episodic_reward_real_mean def learn(self, inputs: Dict) -> torch.Tensor: - # obs: torch.Tensor, next_obs: torch.Tensor, action: torch.Tensor + """ + Overview: + Use observation, next_observation and action to train the inverse model + inputs must contain ['obs', 'next_obs', 'action'] + Arguments: + - obs (:obj:`torch.Tensor`): + The current observation + - next_obs (:obj:`torch.Tensor`): + The next observation + - action (:obj:`torch.Tensor`): + The action + """ cur_obs_embedding = self.embedding_net(inputs['obs']) next_obs_embedding = self.embedding_net(inputs['next_obs']) # get pred action diff --git a/ding/reward_model/ngu_reward_model.py b/ding/reward_model/ngu_reward_model.py index 
0092066904..12971e17d4 100644 --- a/ding/reward_model/ngu_reward_model.py +++ b/ding/reward_model/ngu_reward_model.py @@ -95,10 +95,10 @@ def __init__(self, config: EasyDict, device: str, tb_logger: 'SummaryWriter') -> assert self.intrinsic_reward_type in ['add', 'new', 'assign'] self.train_data_total = [] self.train_data = [] - self.opt = optim.Adam(self.reward_model.predictor.parameters(), config.learning_rate) self.train_cnt_icm = 0 self.estimate_cnt_rnd = 0 self._running_mean_std_rnd = RunningMeanStd(epsilon=1e-4) + self.opt = optim.Adam(self.reward_model.predictor.parameters(), config.learning_rate) self.only_use_last_five_frames = config.only_use_last_five_frames_for_icm_rnd def _train(self) -> torch.Tensor: diff --git a/ding/reward_model/pwil_irl_model.py b/ding/reward_model/pwil_irl_model.py index 465cc16d0f..5ee00eee5b 100644 --- a/ding/reward_model/pwil_irl_model.py +++ b/ding/reward_model/pwil_irl_model.py @@ -7,7 +7,7 @@ from ding.utils import REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel -from .reword_model_utils import concat_state_action_pairs +from .reward_model_utils import concat_state_action_pairs def collect_state_action_pairs(iterator): diff --git a/ding/reward_model/red_irl_model.py b/ding/reward_model/red_irl_model.py index f65c284c52..91afbcf145 100644 --- a/ding/reward_model/red_irl_model.py +++ b/ding/reward_model/red_irl_model.py @@ -7,7 +7,7 @@ from ding.utils import REWARD_MODEL_REGISTRY, one_time_warning from .base_reward_model import BaseRewardModel from .network import REDNetwork -from .reword_model_utils import concat_state_action_pairs +from .reward_model_utils import concat_state_action_pairs @REWARD_MODEL_REGISTRY.register('red') diff --git a/ding/reward_model/reword_model_utils.py b/ding/reward_model/reward_model_utils.py similarity index 100% rename from ding/reward_model/reword_model_utils.py rename to ding/reward_model/reward_model_utils.py diff --git a/ding/reward_model/rnd_reward_model.py 
b/ding/reward_model/rnd_reward_model.py index 528c27838a..3d79178f37 100644 --- a/ding/reward_model/rnd_reward_model.py +++ b/ding/reward_model/rnd_reward_model.py @@ -7,7 +7,7 @@ from ding.utils import REWARD_MODEL_REGISTRY from .base_reward_model import BaseRewardModel -from .reword_model_utils import combine_intrinsic_exterinsic_reward, collect_states, obs_norm +from .reward_model_utils import combine_intrinsic_exterinsic_reward, collect_states, obs_norm from .network import RNDNetwork from ding.utils import RunningMeanStd import numpy as np @@ -166,7 +166,7 @@ def collect_data(self, data: list) -> None: def clear_data(self, iter: int) -> None: assert hasattr( self.cfg, 'clear_buffer_per_iters' - ), "Reward Model does not have clear_buffer_per_iters, Clear failed" + ), "Reward Model does not have clear_buffer_per_iters, if you want to clear buffer, you need to add this attribute in config." if iter % self.cfg.clear_buffer_per_iters == 0: self.train_obs.clear() diff --git a/ding/reward_model/tests/test_reward_model_utils.py b/ding/reward_model/tests/test_reward_model_utils.py index 71eac18a3a..a765969e62 100644 --- a/ding/reward_model/tests/test_reward_model_utils.py +++ b/ding/reward_model/tests/test_reward_model_utils.py @@ -2,7 +2,7 @@ import torch from easydict import EasyDict -from ding.reward_model.reword_model_utils import concat_state_action_pairs, combine_intrinsic_exterinsic_reward +from ding.reward_model.reward_model_utils import concat_state_action_pairs, combine_intrinsic_exterinsic_reward @pytest.mark.unittest diff --git a/ding/reward_model/trex_reward_model.py b/ding/reward_model/trex_reward_model.py index b2da1d0c25..2c41431498 100644 --- a/ding/reward_model/trex_reward_model.py +++ b/ding/reward_model/trex_reward_model.py @@ -15,7 +15,7 @@ from ding.utils.data import default_collate from .base_reward_model import BaseRewardModel -from .reword_model_utils import collect_states +from .reward_model_utils import collect_states from .network import 
TREXNetwork diff --git a/dizoo/atari/config/serial/montezuma/montezuma_ngu_config.py b/dizoo/atari/config/serial/montezuma/montezuma_ngu_config.py deleted file mode 100644 index b3d70714bd..0000000000 --- a/dizoo/atari/config/serial/montezuma/montezuma_ngu_config.py +++ /dev/null @@ -1,131 +0,0 @@ -from easydict import EasyDict - -collector_env_num = 8 -evaluator_env_num = 8 -nstep = 5 -montezuma_ppo_rnd_config = dict( - exp_name='montezuma_ngu_seed0', - env=dict( - collector_env_num=collector_env_num, - evaluator_env_num=evaluator_env_num, - n_evaluator_episode=8, - env_id='MontezumaRevengeNoFrameskip-v4', - #'ALE/MontezumaRevenge-v5' is available. But special setting is needed after gym make. - obs_plus_prev_action_reward=True, # use specific env wrapper for ngu policy - stop_value=int(1e5), - frame_stack=4, - ), - reward_model=dict( - type='ngu-reward', - policy_nstep=nstep, - collect_env_num=collector_env_num, - rnd_reward_model=dict( - intrinsic_reward_type='add', - learning_rate=0.001, - obs_shape=[4, 84, 84], - action_shape=18, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. 
- last_nonzero_reward_rescale=False, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. - last_nonzero_reward_weight=1, - intrinsic_reward_type='add', - learning_rate=0.001, - obs_shape=[4, 84, 84], - action_shape=18, - batch_size=320, - update_per_collect=10, # 32*100/64=50 - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='episodic', - ), - ), - policy=dict( - cuda=True, - on_policy=False, - priority=True, - priority_IS_weight=True, - discount_factor=0.997, - nstep=nstep, - burnin_step=20, - # (int) is the total length of [sequence sample] minus - # the length of burnin part in [sequence sample], - # i.e., = = + - learn_unroll_len=80, # set this key according to the episode length - model=dict( - obs_shape=[4, 84, 84], - action_shape=18, - encoder_hidden_size_list=[128, 128, 512], - collector_env_num=collector_env_num, - ), - learn=dict( - update_per_collect=8, - batch_size=64, - learning_rate=0.0005, - target_update_theta=0.001, - ), - collect=dict( - # NOTE: It is important that set key traj_len_inf=True here, - # to make sure self._traj_len=INF in serial_sample_collector.py. - # In sequence-based policy, for each collect_env, - # we want to collect data of length self._traj_len=INF - # unless the episode enters the 'done' state. - # In each collect phase, we collect a total of sequence samples. 
- n_sample=32, - traj_len_inf=True, - env_num=collector_env_num, - ), - eval=dict(env_num=evaluator_env_num, ), - other=dict( - eps=dict( - type='exp', - start=0.95, - end=0.05, - decay=1e5, - ), - replay_buffer=dict( - replay_buffer_size=int(2e3), - # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization - alpha=0.6, - # (Float type) How much correction is used: 0 means no correction while 1 means full correction - beta=0.4, - ) - ), - ), -) -montezuma_ppo_rnd_config = EasyDict(montezuma_ppo_rnd_config) -main_config = montezuma_ppo_rnd_config -montezuma_ppo_rnd_create_config = dict( - env=dict( - type='atari', - import_names=['dizoo.atari.envs.atari_env'], - ), - env_manager=dict(type='subprocess'), - policy=dict(type='ngu'), - reward_model=dict(type='ngu-reward'), -) -montezuma_ppo_rnd_create_config = EasyDict(montezuma_ppo_rnd_create_config) -create_config = montezuma_ppo_rnd_create_config - -if __name__ == "__main__": - from ding.entry import serial_pipeline_reward_model_offpolicy - serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0) diff --git a/dizoo/atari/config/serial/pitfall/pitfall_ngu_config.py b/dizoo/atari/config/serial/pitfall/pitfall_ngu_config.py deleted file mode 100644 index 42ce53f9cc..0000000000 --- a/dizoo/atari/config/serial/pitfall/pitfall_ngu_config.py +++ /dev/null @@ -1,138 +0,0 @@ -from easydict import EasyDict - -collector_env_num = 32 -evaluator_env_num = 5 -nstep = 5 - -pitfall_ppo_rnd_config = dict( - # Note: - # 1. at least 1e10 timesteps, i.e., 10000 million, the reward may increase, please be patient. - # 2. the larger unroll_lenth and replay buffer size may have better results, but also require more memory. 
- # exp_name='debug_pitfall_ngu_ul298_er01_n32_rlbs2e4', - # exp_name='debug_pitfall_ngu_ul98_er01_n32_rlbs2e4', - # exp_name='debug_pitfall_ngu_ul40_er01_n32_rlbs2e4', - exp_name='pitfall_ngu_seed0', - env=dict( - collector_env_num=collector_env_num, - evaluator_env_num=evaluator_env_num, - n_evaluator_episode=5, - env_id='PitfallNoFrameskip-v4', - #'ALE/Pitfall-v5' is available. But special setting is needed after gym make. - obs_plus_prev_action_reward=True, # use specific env wrapper for ngu policy - stop_value=int(1e5), - frame_stack=4, - ), - reward_model=dict( - type='ngu-reward', - policy_nstep=nstep, - collect_env_num=collector_env_num, - rnd_reward_model=dict( - intrinsic_reward_type='add', # 'assign' - learning_rate=1e-4, - obs_shape=[4, 84, 84], - action_shape=18, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='rnd-ngu', - ), - episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. - # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander - # Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, - # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the - # original last nonzero extrinsic reward. - # please refer to ngu_reward_model for details. - last_nonzero_reward_rescale=False, - # means the rescale value for the last non-zero reward, only used when last_nonzero_reward_rescale is True - # please refer to ngu_reward_model for details. 
- last_nonzero_reward_weight=1, - intrinsic_reward_type='add', - learning_rate=1e-4, - obs_shape=[4, 84, 84], - action_shape=18, - batch_size=320, - update_per_collect=10, - only_use_last_five_frames_for_icm_rnd=False, - clear_buffer_per_iters=10, - nstep=nstep, - hidden_size_list=[128, 128, 64], - type='episodic', - ), - ), - policy=dict( - cuda=True, - on_policy=False, - priority=True, - priority_IS_weight=True, - discount_factor=0.997, - nstep=nstep, - burnin_step=20, - # (int) is the total length of [sequence sample] minus - # the length of burnin part in [sequence sample], - # i.e., = = + - learn_unroll_len=80, # set this key according to the episode length - model=dict( - obs_shape=[4, 84, 84], - action_shape=18, - encoder_hidden_size_list=[128, 128, 512], - collector_env_num=collector_env_num, - ), - learn=dict( - update_per_collect=8, - batch_size=64, - learning_rate=0.0005, - target_update_theta=0.001, - ), - collect=dict( - # NOTE: It is important that set key traj_len_inf=True here, - # to make sure self._traj_len=INF in serial_sample_collector.py. - # In sequence-based policy, for each collect_env, - # we want to collect data of length self._traj_len=INF - # unless the episode enters the 'done' state. - # In each collect phase, we collect a total of sequence samples. 
- n_sample=32, - traj_len_inf=True, - env_num=collector_env_num, - ), - eval=dict(env_num=evaluator_env_num, ), - other=dict( - eps=dict( - type='exp', - start=0.95, - end=0.05, - decay=1e5, - ), - replay_buffer=dict( - replay_buffer_size=int(3e3), - # (Float type) How much prioritization is used: 0 means no prioritization while 1 means full prioritization - alpha=0.6, - # (Float type) How much correction is used: 0 means no correction while 1 means full correction - beta=0.4, - ) - ), - ), -) -pitfall_ppo_rnd_config = EasyDict(pitfall_ppo_rnd_config) -main_config = pitfall_ppo_rnd_config -pitfall_ppo_rnd_create_config = dict( - env=dict( - type='atari', - import_names=['dizoo.atari.envs.atari_env'], - ), - env_manager=dict(type='subprocess'), - policy=dict(type='ngu'), - reward_model=dict(type='ngu-reward'), -) -pitfall_ppo_rnd_create_config = EasyDict(pitfall_ppo_rnd_create_config) -create_config = pitfall_ppo_rnd_create_config - -if __name__ == "__main__": - from ding.entry import serial_pipeline_reward_model_offpolicy - serial_pipeline_reward_model_offpolicy([main_config, create_config], seed=0) diff --git a/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py b/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py index 12ba7984aa..5c20b0a0c4 100644 --- a/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py +++ b/dizoo/atari/config/serial/pong/pong_gail_dqn_config.py @@ -93,4 +93,4 @@ ) # train reward model - serial_pipeline_reward_model_offpolicy(main_config, create_config) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py b/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py index 87b3cbd6a4..af58398a68 100644 --- a/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py +++ b/dizoo/atari/config/serial/pong/pong_trex_offppo_config.py @@ -97,4 +97,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for 
training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/atari/config/serial/pong/pong_trex_sql_config.py b/dizoo/atari/config/serial/pong/pong_trex_sql_config.py index 9e989dafe7..991aae76f2 100644 --- a/dizoo/atari/config/serial/pong/pong_trex_sql_config.py +++ b/dizoo/atari/config/serial/pong/pong_trex_sql_config.py @@ -82,4 +82,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py b/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py index ec135e4399..9925075f8a 100644 --- a/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py +++ b/dizoo/atari/config/serial/qbert/qbert_trex_dqn_config.py @@ -90,4 +90,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py b/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py index 0076f56f26..bb80daa3c4 100644 --- a/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py +++ b/dizoo/atari/config/serial/qbert/qbert_trex_offppo_config.py @@ -96,4 +96,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_dqn_config.py b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_dqn_config.py index d4688430cf..22c6a4e65a 100644 --- a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_dqn_config.py +++ b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_dqn_config.py @@ -92,4 +92,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_offppo_config.py b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_offppo_config.py index dcc12dd9b5..c75fa79171 100644 --- a/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_offppo_config.py +++ b/dizoo/atari/config/serial/spaceinvaders/spaceinvaders_trex_offppo_config.py @@ -99,4 +99,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py b/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py index f568a04667..8fca3d69db 100755 --- a/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py +++ b/dizoo/box2d/bipedalwalker/config/bipedalwalker_gail_sac_config.py @@ -100,4 +100,4 @@ ) # train reward model - serial_pipeline_reward_model_offpolicy(main_config, create_config) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py b/dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py index 17e98559da..d38343fbc7 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_drex_dqn_config.py @@ -120,5 +120,5 @@ del args.cfg[0].policy.collect.n_sample drex_collecting_data(args) 
serial_pipeline_reward_model_offpolicy( - (main_config, create_config), pretrain_reward=True, cooptrain_reward=False, max_env_step=int(1e7) + (main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False, max_env_step=int(1e7) ) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py b/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py index 3e8742c3dd..c992a1263a 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_gail_dqn_config.py @@ -109,4 +109,4 @@ ) # train reward model - serial_pipeline_reward_model_offpolicy(main_config, create_config) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py b/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py index ee60926c4c..f19b2bfa71 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_ngu_config.py @@ -31,11 +31,11 @@ type='rnd-ngu', ), episodic_reward_model=dict( - # means if using rescale trick to the last non-zero reward - # when combing extrinsic and intrinsic reward. + # This refers to the use of a rescaling method applied to the final non-zero reward. + # when combing the extrinsic and intrinsic reward. # the rescale trick only used in: - # 1. sparse reward env minigrid, in which the last non-zero reward is a strong positive signal - # 2. the last reward of each episode directly reflects the agent's completion of the task, e.g. lunarlander + # 1. Environments with sparse rewards, such as the MiniGrid, where the final non-zero reward provides a significant positive signal. + # 2. Situations where the last reward of each episode directly corresponds to the agent's task completion, such as in the LunarLander environment. 
# Note that the ngu intrinsic reward is a positive value (max value is 5), in these envs, # the last non-zero reward should not be overwhelmed by intrinsic rewards, so we need rescale the # original last nonzero extrinsic reward. diff --git a/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py b/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py index 86e0a6f3d5..f3e3fc61cd 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_trex_dqn_config.py @@ -105,4 +105,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py b/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py index 88f66fef9f..a0e7ceb024 100644 --- a/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py +++ b/dizoo/box2d/lunarlander/config/lunarlander_trex_offppo_config.py @@ -81,4 +81,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/classic_control/cartpole/config/cartpole_drex_dqn_config.py b/dizoo/classic_control/cartpole/config/cartpole_drex_dqn_config.py index f30134f35c..7c617aad85 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_drex_dqn_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_drex_dqn_config.py @@ -111,4 +111,4 @@ args.cfg[0].policy.collect.n_episode = 8 del args.cfg[0].policy.collect.n_sample drex_collecting_data(args) - serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py index 34363359a9..df23981ecd 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_dqn_config.py @@ -87,4 +87,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py index 71d0ae73a8..b8d7c5887f 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_offppo_config.py @@ -86,4 +86,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py index 3f1ecfc44d..a1b44b5578 100644 --- a/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py +++ b/dizoo/classic_control/cartpole/config/cartpole_trex_onppo_config.py @@ -85,4 +85,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_onpolicy((main_config, create_config), pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_onpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/mujoco/config/ant_gail_sac_config.py b/dizoo/mujoco/config/ant_gail_sac_config.py index 7072f38142..7b268a8667 100644 --- a/dizoo/mujoco/config/ant_gail_sac_config.py +++ b/dizoo/mujoco/config/ant_gail_sac_config.py @@ -101,4 +101,4 @@ ) # train reward model - serial_pipeline_reward_model_offpolicy(main_config, create_config) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/mujoco/config/ant_trex_onppo_config.py b/dizoo/mujoco/config/ant_trex_onppo_config.py index f3d5e96b75..8bdf8bbb2a 100644 --- a/dizoo/mujoco/config/ant_trex_onppo_config.py +++ b/dizoo/mujoco/config/ant_trex_onppo_config.py @@ -75,6 +75,15 @@ create_config = ant_trex_ppo_create_config if __name__ == "__main__": - # or you can enter `ding -m serial -c ant_trex_onppo_config.py -s 0` - from ding.entry import serial_pipeline_trex_onpolicy - serial_pipeline_trex_onpolicy((main_config, create_config), seed=0) + import argparse + import torch + from ding.entry import trex_collecting_data + from ding.entry import serial_pipeline_reward_model_onpolicy + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', type=str, default='please enter abs path for this file') + parser.add_argument('--seed', type=int, default=0) + parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') + args = parser.parse_args() + # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
+ trex_collecting_data(args) + serial_pipeline_reward_model_onpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/mujoco/config/ant_trex_sac_config.py b/dizoo/mujoco/config/ant_trex_sac_config.py index 6c0ef73097..63aa6b6808 100644 --- a/dizoo/mujoco/config/ant_trex_sac_config.py +++ b/dizoo/mujoco/config/ant_trex_sac_config.py @@ -85,6 +85,15 @@ create_config = ant_trex_sac_create_config if __name__ == "__main__": - # or you can enter `ding -m serial -c ant_trex_sac_config.py -s 0` - from ding.entry import serial_pipeline_trex - serial_pipeline_trex((main_config, create_config), seed=0) + import argparse + import torch + from ding.entry import trex_collecting_data + from ding.entry import serial_pipeline_reward_model_offpolicy + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', type=str, default='please enter abs path for this file') + parser.add_argument('--seed', type=int, default=0) + parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu') + args = parser.parse_args() + # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
+ trex_collecting_data(args) + serial_pipeline_reward_model_offpolicy((main_config, create_config), pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/mujoco/config/halfcheetah_gail_sac_config.py b/dizoo/mujoco/config/halfcheetah_gail_sac_config.py index be0d6d2ef8..ff62473289 100644 --- a/dizoo/mujoco/config/halfcheetah_gail_sac_config.py +++ b/dizoo/mujoco/config/halfcheetah_gail_sac_config.py @@ -100,4 +100,4 @@ ) # train reward model - serial_pipeline_reward_model_offpolicy(main_config, create_config) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py b/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py index 83718e6379..867b4381e8 100644 --- a/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py +++ b/dizoo/mujoco/config/halfcheetah_trex_onppo_config.py @@ -96,4 +96,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/mujoco/config/halfcheetah_trex_sac_config.py b/dizoo/mujoco/config/halfcheetah_trex_sac_config.py index 30f9581771..ecacdcf0d4 100644 --- a/dizoo/mujoco/config/halfcheetah_trex_sac_config.py +++ b/dizoo/mujoco/config/halfcheetah_trex_sac_config.py @@ -98,4 +98,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/mujoco/config/hopper_trex_onppo_config.py b/dizoo/mujoco/config/hopper_trex_onppo_config.py index c905374dbf..5e375d668c 100644 --- a/dizoo/mujoco/config/hopper_trex_onppo_config.py +++ b/dizoo/mujoco/config/hopper_trex_onppo_config.py @@ -89,4 +89,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/mujoco/config/hopper_trex_sac_config.py b/dizoo/mujoco/config/hopper_trex_sac_config.py index f37d39240c..8dece97ca0 100644 --- a/dizoo/mujoco/config/hopper_trex_sac_config.py +++ b/dizoo/mujoco/config/hopper_trex_sac_config.py @@ -98,4 +98,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/mujoco/config/walker2d_gail_ddpg_config.py b/dizoo/mujoco/config/walker2d_gail_ddpg_config.py index 779f65f63b..d4370349dc 100644 --- a/dizoo/mujoco/config/walker2d_gail_ddpg_config.py +++ b/dizoo/mujoco/config/walker2d_gail_ddpg_config.py @@ -87,13 +87,17 @@ # or you can enter `ding -m serial_gail -c walker2d_gail_ddpg_config.py -s 0` # then input the config you used to generate your expert model in the path mentioned above # e.g. walker2d_ddpg_config.py - from ding.entry import serial_pipeline_gail + from ding.entry import serial_pipeline_reward_model_offpolicy, collect_demo_data from dizoo.mujoco.config.walker2d_ddpg_config import walker2d_ddpg_config, walker2d_ddpg_create_config - expert_main_config = walker2d_ddpg_config - expert_create_config = walker2d_ddpg_create_config - serial_pipeline_gail( - [main_config, create_config], [expert_main_config, expert_create_config], - max_env_step=1000000, - seed=0, - collect_data=True + + # set your expert config here + expert_cfg = (walker2d_ddpg_config, walker2d_ddpg_create_config) + expert_data_path = main_config.reward_model.data_path + '/expert_data.pkl' + + # collect expert data + collect_demo_data( + expert_cfg, seed=0, expert_data_path=expert_data_path, collect_count=main_config.reward_model.collect_count ) + + # train reward model + serial_pipeline_reward_model_offpolicy((main_config, create_config)) \ No newline at end of file diff --git a/dizoo/mujoco/config/walker2d_gail_sac_config.py b/dizoo/mujoco/config/walker2d_gail_sac_config.py index a4e053c8cc..a075c32299 100644 --- a/dizoo/mujoco/config/walker2d_gail_sac_config.py +++ b/dizoo/mujoco/config/walker2d_gail_sac_config.py @@ -101,4 +101,4 @@ ) # train reward model - 
serial_pipeline_reward_model_offpolicy(main_config, create_config) + serial_pipeline_reward_model_offpolicy((main_config, create_config)) diff --git a/dizoo/mujoco/config/walker2d_trex_onppo_config.py b/dizoo/mujoco/config/walker2d_trex_onppo_config.py index fb9e5a930a..bde35d48f3 100644 --- a/dizoo/mujoco/config/walker2d_trex_onppo_config.py +++ b/dizoo/mujoco/config/walker2d_trex_onppo_config.py @@ -88,4 +88,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. trex_collecting_data(args) - serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_onpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) diff --git a/dizoo/mujoco/config/walker2d_trex_sac_config.py b/dizoo/mujoco/config/walker2d_trex_sac_config.py index c0588067e5..998f6dcbeb 100644 --- a/dizoo/mujoco/config/walker2d_trex_sac_config.py +++ b/dizoo/mujoco/config/walker2d_trex_sac_config.py @@ -97,4 +97,4 @@ args = parser.parse_args() # The function ``trex_collecting_data`` below is to collect episodic data for training the reward model in trex. 
trex_collecting_data(args) - serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward=True, cooptrain_reward=False) + serial_pipeline_reward_model_offpolicy([main_config, create_config], pretrain_reward_model=True, cooptrain_reward_model=False) From 97da5c69c80e0e8aca4a532ffeb957d806886339 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Tue, 20 Jun 2023 00:18:46 -0400 Subject: [PATCH 50/52] fix style for rnd and icm --- ding/reward_model/icm_reward_model.py | 6 +++--- ding/reward_model/rnd_reward_model.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ding/reward_model/icm_reward_model.py b/ding/reward_model/icm_reward_model.py index 2cdc864362..f277791bb6 100644 --- a/ding/reward_model/icm_reward_model.py +++ b/ding/reward_model/icm_reward_model.py @@ -52,10 +52,10 @@ class ICMRewardModel(BaseRewardModel): 8 | ``hidden`` list [64, 64, | the MLP layer shape | | ``_size_list`` (int) 128] | | 9 | ``inverse_`` int 512 | the inverse model hidden size | - | ``hidden_size`` - 10 | ``update_per_`` int 100 | Number of updates per collect | + | ``hidden_size`` | | + 10 | ``update_per_`` int 100 | Number of updates per collect | | ``collect`` | | - 11 | ``reverse_loss`` float 1 | the importance weight of the | + 11 | ``reverse_loss`` float 1 | the importance weight of the | ``_weight`` | forward and reverse loss | 12 | ``intrinsic_`` float 0.003 | the weight of intrinsic reward | r = w*r_i + r_e ``reward_weight`` diff --git a/ding/reward_model/rnd_reward_model.py b/ding/reward_model/rnd_reward_model.py index 3d79178f37..076a8ccb23 100644 --- a/ding/reward_model/rnd_reward_model.py +++ b/ding/reward_model/rnd_reward_model.py @@ -166,7 +166,8 @@ def collect_data(self, data: list) -> None: def clear_data(self, iter: int) -> None: assert hasattr( self.cfg, 'clear_buffer_per_iters' - ), "Reward Model does not have clear_buffer_per_iters, if you want to clear buffer, you need to add this attribute in config." 
+ ), "Reward Model does not have clear_buffer_per_iters, \ + if you want to clear buffer, you need to add this attribute in config." if iter % self.cfg.clear_buffer_per_iters == 0: self.train_obs.clear() From 774b2a4fadacea9a32afcf1033e1620ea43e0edb Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Tue, 20 Jun 2023 00:43:39 -0400 Subject: [PATCH 51/52] fix style for rnd and icm --- ding/reward_model/rnd_reward_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ding/reward_model/rnd_reward_model.py b/ding/reward_model/rnd_reward_model.py index 076a8ccb23..ca153492ae 100644 --- a/ding/reward_model/rnd_reward_model.py +++ b/ding/reward_model/rnd_reward_model.py @@ -168,6 +168,7 @@ def clear_data(self, iter: int) -> None: self.cfg, 'clear_buffer_per_iters' ), "Reward Model does not have clear_buffer_per_iters, \ if you want to clear buffer, you need to add this attribute in config." + if iter % self.cfg.clear_buffer_per_iters == 0: self.train_obs.clear() From b78e36ce6c88ba0369714a0d5ada03491708c5a7 Mon Sep 17 00:00:00 2001 From: Ruoyu Gao Date: Tue, 20 Jun 2023 00:57:57 -0400 Subject: [PATCH 52/52] fix style for icm --- ding/reward_model/icm_reward_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ding/reward_model/icm_reward_model.py b/ding/reward_model/icm_reward_model.py index f277791bb6..d988a1aed5 100644 --- a/ding/reward_model/icm_reward_model.py +++ b/ding/reward_model/icm_reward_model.py @@ -51,11 +51,11 @@ class ICMRewardModel(BaseRewardModel): 7 | ``residual_num`` int 4 | the residual number of residual net | 8 | ``hidden`` list [64, 64, | the MLP layer shape | | ``_size_list`` (int) 128] | | - 9 | ``inverse_`` int 512 | the inverse model hidden size | + 9 | ``inverse_`` int 512 | the inverse model hidden size | | ``hidden_size`` | | - 10 | ``update_per_`` int 100 | Number of updates per collect | + 10 | ``update_per_`` int 100 | Number of updates per collect | | ``collect`` | | - 11 | ``reverse_loss`` float 1 | the 
importance weight of the | + 11 | ``reverse_loss`` float 1 | the importance weight of the | ``_weight`` | forward and reverse loss | 12 | ``intrinsic_`` float 0.003 | the weight of intrinsic reward | r = w*r_i + r_e ``reward_weight``