From 41b6905ed2b2567aa06acd9b333fd96592fb6eca Mon Sep 17 00:00:00 2001 From: Troi Williams <40696868+troiwill@users.noreply.github.com> Date: Sun, 24 Mar 2024 13:40:18 -0400 Subject: [PATCH 1/4] Initial implementation of response and response model. --- pomdp_py/algorithms/po_rollout.pxd | 4 +- pomdp_py/algorithms/po_rollout.pyx | 48 +++---- pomdp_py/algorithms/po_uct.pxd | 5 +- pomdp_py/algorithms/po_uct.pyx | 50 ++++--- pomdp_py/algorithms/pomcp.pyx | 4 +- pomdp_py/algorithms/value_iteration.pyx | 4 +- pomdp_py/framework/basics.pxd | 14 +- pomdp_py/framework/basics.pyx | 172 +++++++++++++++++++----- tests/test_response.py | 41 ++++++ 9 files changed, 253 insertions(+), 89 deletions(-) create mode 100644 tests/test_response.py diff --git a/pomdp_py/algorithms/po_rollout.pxd b/pomdp_py/algorithms/po_rollout.pxd index 1c5523ae..8bbd180b 100644 --- a/pomdp_py/algorithms/po_rollout.pxd +++ b/pomdp_py/algorithms/po_rollout.pxd @@ -1,4 +1,4 @@ -from pomdp_py.framework.basics cimport Action, State, Observation, Agent +from pomdp_py.framework.basics cimport Action, State, Observation, Agent, Response from pomdp_py.framework.planner cimport Planner from pomdp_py.algorithms.po_uct cimport RolloutPolicy, ActionPrior @@ -11,7 +11,7 @@ cdef class PORollout(Planner): cdef float _discount_factor cdef bint _particles cdef Agent _agent - cdef float _last_best_reward + cdef Response _last_best_response cpdef _search(self) cpdef _rollout(self, State state, int depth) diff --git a/pomdp_py/algorithms/po_rollout.pyx b/pomdp_py/algorithms/po_rollout.pyx index 324cf3d2..4e23f427 100644 --- a/pomdp_py/algorithms/po_rollout.pyx +++ b/pomdp_py/algorithms/po_rollout.pyx @@ -15,7 +15,7 @@ it will do the rollouts and action selection as described. from pomdp_py.framework.basics cimport Action, Agent, POMDP, State, Observation,\ ObservationModel, TransitionModel, GenerativeDistribution, PolicyModel,\ - sample_generative_model + sample_generative_model, Response from pomdp_py.framework.planner cimport Planner from pomdp_py.representations.distribution.particles cimport Particles from pomdp_py.representations.belief.particles cimport particle_reinvigoration @@ -46,58 +46,60 @@ cdef class PORollout(Planner): self._particles = particles self._agent = None - self._last_best_reward = float('-inf') + self._last_best_response = Response({"reward": float('-inf')}) @property - def last_best_reward(self): - return self._last_best_reward + def last_best_response(self): + return self._last_best_response cpdef public plan(self, Agent agent): self._agent = agent - best_action, best_reward = self._search() - self._last_best_reward = best_reward + best_action, best_response = self._search() + self._last_best_response = best_response return best_action cpdef _search(self): cdef Action best_action - cdef float best_reward, reward_avg, total_discounted_reward + cdef Response best_response = Response() + cdef Response response_avg = Response() + cdef Response total_discounted_response = Response() cdef set legal_actions - cdef list rewards + cdef list responses - best_action, best_reward = None, float("-inf") + best_action, best_response["reward"] = None, float("-inf") legal_actions = self._agent.valid_actions(history=self._agent.history) for action in legal_actions: - rewards = [] + responses = [] for i in range(self._num_sims // len(legal_actions)): state = self._agent.belief.random() - total_discounted_reward = self._rollout(state, 0) - rewards.append(total_discounted_reward) - reward_avg = sum(rewards) / len(rewards) - if reward_avg > 
best_reward: + total_discounted_response = self._rollout(state, 0) + responses.append(total_discounted_response["reward"]) + response_avg["reward"] = sum(responses) / len(responses) + if response_avg["reward"] > best_response["reward"]: best_action = action - best_reward = reward_avg - return best_action, best_reward + best_response["reward"] = response_avg["reward"] + return best_action, best_response cpdef _rollout(self, State state, int depth): # Rollout without a tree. cdef Action action cdef float discount = 1.0 - cdef float total_discounted_reward = 0 + cdef Response total_discounted_response = Response() cdef State next_state cdef Observation observation - cdef float reward + cdef Response response = Response() cdef int nsteps cdef tuple history = self._agent.history while depth <= self._max_depth: action = self._rollout_policy.rollout(state, history=history) - next_state, observation, reward, nsteps = sample_generative_model(self._agent, state, action) + next_state, observation, response, nsteps = sample_generative_model(self._agent, state, action) history = history + ((action, observation),) depth += 1 - total_discounted_reward += reward * discount + total_discounted_response = total_discounted_response + response * discount discount *= self._discount_factor state = next_state - return total_discounted_reward + return total_discounted_response cpdef update(self, Agent agent, Action real_action, Observation real_observation, state_transform_func=None): @@ -110,7 +112,7 @@ cdef class PORollout(Planner): if not isinstance(cur_belief, Particles): raise ValueError("Agent's belief is not in particles.") for state in cur_belief.particles: - next_state, observation, reward, nsteps = sample_generative_model(agent, state, + next_state, observation, response, nsteps = sample_generative_model(agent, state, real_action) if observation == real_observation: new_belief.add(next_state) @@ -128,7 +130,7 @@ cdef class PORollout(Planner): def clear_agent(self): """clear_agent(self)""" self._agent = None # forget about current agent so that can plan for another agent. - self._last_best_reward = float('-inf') + self._last_best_response["reward"] = float('-inf') cpdef set_rollout_policy(self, RolloutPolicy rollout_policy): """ diff --git a/pomdp_py/algorithms/po_uct.pxd b/pomdp_py/algorithms/po_uct.pxd index 6f66fffd..3517d8d7 100644 --- a/pomdp_py/algorithms/po_uct.pxd +++ b/pomdp_py/algorithms/po_uct.pxd @@ -1,5 +1,5 @@ from pomdp_py.framework.planner cimport Planner -from pomdp_py.framework.basics cimport Agent, PolicyModel, Action, State, Observation +from pomdp_py.framework.basics cimport Agent, PolicyModel, Action, State, Observation, Response cdef class TreeNode: cdef public dict children @@ -7,10 +7,11 @@ cdef class TreeNode: cdef public float value cdef class QNode(TreeNode): - pass + cpdef void update(QNode self, Response response) cdef class VNode(TreeNode): cpdef argmax(VNode self) + cpdef void update(VNode self) cdef class RootVNode(VNode): cdef public tuple history diff --git a/pomdp_py/algorithms/po_uct.pyx b/pomdp_py/algorithms/po_uct.pyx index c0f02665..0cbc731c 100644 --- a/pomdp_py/algorithms/po_uct.pyx +++ b/pomdp_py/algorithms/po_uct.pyx @@ -35,7 +35,7 @@ the prior knowledge. 
from pomdp_py.framework.basics cimport Action, Agent, POMDP, State, Observation,\ ObservationModel, TransitionModel, GenerativeDistribution, PolicyModel,\ - sample_generative_model + sample_generative_model, Response from pomdp_py.framework.planner cimport Planner from pomdp_py.representations.distribution.particles cimport Particles from pomdp_py.utils import typ @@ -64,13 +64,20 @@ cdef class QNode(TreeNode): self.num_visits = num_visits self.value = value self.children = {} # o -> VNode + def __str__(self): return typ.red("QNode") + "(%.3f, %.3f | %s)" % (self.num_visits, self.value, str(self.children.keys())) + def __repr__(self): return self.__str__() + cpdef void update(QNode self, Response response): + self.num_visits += 1 + self.value = self.value + (response["reward"] - self.value) / self.num_visits + + cdef class VNode(TreeNode): def __init__(self, num_visits, **kwargs): self.num_visits = num_visits @@ -98,6 +105,9 @@ cdef class VNode(TreeNode): best_value = self[action].value return best_action + cpdef void update(VNode self): + self.num_visits += 1 + @property def value(self): best_action = max(self.children, key=lambda action: self.children[action].value) @@ -361,7 +371,7 @@ cdef class POUCT(Planner): State state, tuple history, VNode root, QNode parent, Observation observation, int depth): if depth > self._max_depth: - return 0 + return Response() if root is None: if self._agent.tree is None: root = self._VNode(root=True) @@ -373,46 +383,46 @@ cdef class POUCT(Planner): if parent is not None: parent[observation] = root self._expand_vnode(root, history, state=state) - rollout_reward = self._rollout(state, history, root, depth) - return rollout_reward + rollout_response = self._rollout(state, history, root, depth) + return rollout_response cdef int nsteps action = self._ucb(root) - next_state, observation, reward, nsteps = sample_generative_model(self._agent, state, action) + next_state, observation, response, nsteps = sample_generative_model(self._agent, state, action) if nsteps == 0: # This indicates the provided action didn't lead to transition # Perhaps the action is not allowed to be performed for the given state # (for example, the state is not in the initiation set of the option, # or the state is a terminal state) - return reward + return response - total_reward = reward + (self._discount_factor**nsteps)*self._simulate(next_state, + total_response = response + (self._discount_factor**nsteps)*self._simulate(next_state, history + ((action, observation),), root[action][observation], root[action], observation, depth+nsteps) - root.num_visits += 1 - root[action].num_visits += 1 - root[action].value = root[action].value + (total_reward - root[action].value) / (root[action].num_visits) - return total_reward + + root.update() + root[action].update(total_response) + return total_response cpdef _rollout(self, State state, tuple history, VNode root, int depth): cdef Action action cdef float discount = 1.0 - cdef float total_discounted_reward = 0 + cdef Response total_discounted_response = Response() cdef State next_state cdef Observation observation - cdef float reward + cdef Response response = Response() while depth < self._max_depth: action = self._rollout_policy.rollout(state, history) - next_state, observation, reward, nsteps = sample_generative_model(self._agent, state, action) + next_state, observation, response, nsteps = sample_generative_model(self._agent, state, action) history = history + ((action, observation),) depth += nsteps - total_discounted_reward += reward * 
discount + total_discounted_response = total_discounted_response + response * discount discount *= (self._discount_factor**nsteps) state = next_state - return total_discounted_reward + return total_discounted_response cpdef Action _ucb(self, VNode root): """UCB1""" @@ -436,15 +446,15 @@ cdef class POUCT(Planner): ''' cdef State next_state cdef Observation observation - cdef float reward + cdef Response response if self._agent.transition_model is None: - next_state, observation, reward = self._agent.generative_model.sample(state, action) + next_state, observation, response = self._agent.generative_model.sample(state, action) else: next_state = self._agent.transition_model.sample(state, action) observation = self._agent.observation_model.sample(next_state, action) - reward = self._agent.reward_model.sample(state, action, next_state) - return next_state, observation, reward + response = self._agent.response_model.sample(state, action, next_state) + return next_state, observation, response def _VNode(self, root=False, **kwargs): """Returns a VNode with default values; The function naming makes it clear diff --git a/pomdp_py/algorithms/pomcp.pyx b/pomdp_py/algorithms/pomcp.pyx index 349b8127..52804cd6 100644 --- a/pomdp_py/algorithms/pomcp.pyx +++ b/pomdp_py/algorithms/pomcp.pyx @@ -128,10 +128,10 @@ cdef class POMCP(POUCT): cpdef _simulate(POMCP self, State state, tuple history, VNode root, QNode parent, Observation observation, int depth): - total_reward = POUCT._simulate(self, state, history, root, parent, observation, depth) + total_response = POUCT._simulate(self, state, history, root, parent, observation, depth) if depth == 1 and root is not None: root.belief.add(state) # belief update happens as simulation goes. - return total_reward + return total_response def _VNode(self, root=False, **kwargs): """Returns a VNode with default values; The function naming makes it clear diff --git a/pomdp_py/algorithms/value_iteration.pyx b/pomdp_py/algorithms/value_iteration.pyx index 680e083e..331194b1 100644 --- a/pomdp_py/algorithms/value_iteration.pyx +++ b/pomdp_py/algorithms/value_iteration.pyx @@ -48,8 +48,8 @@ cdef class _PolicyTreeNode: subtree_value = self.children[o].values[sp] # corresponds to V_{oi(p)} in paper else: subtree_value = 0.0 - reward = self._agent.reward_model.sample(s, self.action, sp) - expected_future_value += trans_prob * obsrv_prob * (reward + discount_factor*subtree_value) + response = self._agent.response_model.sample(s, self.action, sp) + expected_future_value += trans_prob * obsrv_prob * (response["reward"] + discount_factor*subtree_value) values[s] = expected_future_value return values diff --git a/pomdp_py/framework/basics.pxd b/pomdp_py/framework/basics.pxd index b3824538..038de4cc 100644 --- a/pomdp_py/framework/basics.pxd +++ b/pomdp_py/framework/basics.pxd @@ -8,6 +8,10 @@ cdef class TransitionModel: pass cdef class PolicyModel: pass + +cdef class ResponseModel(dict): + pass + cdef class BlackboxModel: pass cdef class RewardModel: @@ -27,6 +31,12 @@ cdef class State: cdef class Observation: pass +cdef class Vector(list): + pass + +cdef class Response(dict): + pass + cdef class Agent: cdef GenerativeDistribution _init_belief cdef PolicyModel _policy_model @@ -41,7 +51,7 @@ cdef class Agent: cdef class Environment: cdef State _init_state cdef TransitionModel _transition_model - cdef RewardModel _reward_model + cdef ResponseModel _response_model cdef BlackboxModel _blackbox_model cdef State _cur_state @@ -49,4 +59,4 @@ cdef class Option(Action): pass cpdef 
sample_generative_model(Agent agent, State state, Action action, float discount_factor=*) -cpdef sample_explict_models(TransitionModel T, ObservationModel O, RewardModel R, State state, Action a, float discount_factor=*) +cpdef sample_explict_models(TransitionModel T, ObservationModel O, ResponseModel R, State state, Action a, float discount_factor=*) diff --git a/pomdp_py/framework/basics.pyx b/pomdp_py/framework/basics.pyx index d53c0b35..b8f24a75 100644 --- a/pomdp_py/framework/basics.pyx +++ b/pomdp_py/framework/basics.pyx @@ -186,6 +186,39 @@ cdef class RewardModel: Returns the underlying distribution of the model""" raise NotImplementedError +cdef class ResponseModel(dict): + """A ResponseModel returns a real or simulated response + after the agent interacts with the real or a simulated environment. + The implementation of this model contains a collection of more + specific models such as reward and cost models.""" + + def __init__(self, models): + if not isinstance(models, dict): + raise TypeError("models must be a dictionary of models.") + for key, model in models.items(): + if not hasattr(model, "sample"): + raise NotImplementedError(f"Model named {key} must implement a sample function.") + self[key] = model + + def sample(self, state, action, next_state, **kwargs): + """sample(self, state, action, next_state) + Returns a randomly sampled response according to the + distribution of the internal models. + + Args: + state (~pomdp_py.framework.basics.State): the next state :math:`s` + action (~pomdp_py.framework.basics.Action): the action :math:`a` + next_state (State): the next state :math:`s'` + Returns: + Response: the response + """ + return Response( + dict([ + (name, model.sample(state, action, next_state, **kwargs)) + for name, model in self.items() + ]) + ) + cdef class BlackboxModel: """ A BlackboxModel is the generative distribution :math:`G(s,a)` @@ -317,33 +350,100 @@ cdef class Observation: def __ne__(self, other): return not self.__eq__(other) +cdef class Vector(list): + def __init__(self, values=list()): + if not isinstance(values, list): + raise TypeError(f"values must be type list, but got {type(values)}.") + for v in values: + self.append(v) + + def __eq__(self, other): + if not isinstance(other, (Vector, list)): + raise TypeError(f"other must be type Vector or list, but got {type(other)}.") + return len(self) == len(other) and all(v0 == v1 for v0, v1 in zip(self, other)) + + def __add__(self, other): + if isinstance(other, (float, int)): + vec = [other] * len(self) + elif isinstance(other, Vector): + vec = other + else: + raise TypeError(f"other must be type Vector, float, or int, but got {type(other)}.") + return Vector([v0 + v1 for v0, v1 in zip(self, vec)]) + + def __mul__(self, other): + if not isinstance(other, (float, int)): + raise TypeError(f"other must be type float or int, but got {type(other)}.") + return Vector([v * other for v in self]) + +cdef class Response(dict): + """ + The Response class. 
+ """ + def __init__(self, variables=dict(reward=0.0)): + super().__init__() + if not isinstance(variables, dict): + raise TypeError(f"reward must be type dict, but got {type(variables)}.") + for k, v in variables.items(): + self[k] = v + + def __add__(self, other): + if not isinstance(other, Response): + raise TypeError("other must be type Response.") + return Response( + dict([ + (name, value + other[name]) + for name, value in self.items() + ]) + ) + + def __radd__(self, other): + return self.__add__(other) + + def __mul__(self, other): + if not isinstance(other, (float, int)): + raise TypeError("other must be type float or int.") + return Response( + dict([ + (name, value * other) + for name, value in self.items() + ]) + ) + + def __rmul__(self, other): + return self.__mul__(other) + + def __str__(self): + return ", ".join([f"{k}={v}" for k, v in self.items()]) + + cdef class Agent: """ An Agent operates in an environment by taking actions, receiving observations, and updating its belief. Taking actions is the job of a planner (:class:`Planner`), and the belief update is the job taken care of by the belief representation or the planner. But, the Agent supplies the - :class:`TransitionModel`, :class:`ObservationModel`, :class:`RewardModel`, + :class:`TransitionModel`, :class:`ObservationModel`, :class:`ResponseModel`, OR :class:`BlackboxModel` to the planner or the belief update algorithm. __init__(self, init_belief, policy_model, transition_model=None, observation_model=None, - reward_model=None, + response_model=None, blackbox_model=None) """ def __init__(self, init_belief, policy_model=None, transition_model=None, observation_model=None, - reward_model=None, + response_model=None, blackbox_model=None): self._init_belief = init_belief self._policy_model = policy_model self._transition_model = transition_model self._observation_model = observation_model - self._reward_model = reward_model + self._response_model = response_model self._blackbox_model = blackbox_model # For online planning @@ -399,8 +499,8 @@ cdef class Agent: return self._transition_model @property - def reward_model(self): - return self._reward_model + def response_model(self): + return self._response_model @property def policy_model(self): @@ -415,14 +515,14 @@ cdef class Agent: return self.blackbox_model def set_models(self, transition_model=None, observation_model=None, - reward_model=None, blackbox_model=None, policy_model=None): + response_model=None, blackbox_model=None, policy_model=None): """Re-assign the models to be the ones given.""" if transition_model is not None: self._transition_model = transition_model if observation_model is not None: self._observation_model = observation_model - if reward_model is not None: - self._reward_model = reward_model + if response_model is not None: + self._response_model = response_model if blackbox_model is not None: self._blackbox_model = blackbox_model if policy_model is not None: @@ -478,17 +578,17 @@ cdef class Environment: __init__(self, init_state, transition_model=None, - reward_model=None, + response_model=None, blackbox_model=None) """ def __init__(self, init_state, transition_model=None, - reward_model=None, + response_model=None, blackbox_model=None): self._init_state = init_state self._cur_state = init_state self._transition_model = transition_model - self._reward_model = reward_model + self._response_model = response_model self._blackbox_model = blackbox_model @property @@ -507,21 +607,21 @@ cdef class Environment: return self._transition_model @property - def 
reward_model(self): - """The :class:`RewardModel` underlying the environment""" - return self._reward_model + def response_model(self): + """The :class:`ResponseModel` underlying the environment""" + return self._response_model @property def blackbox_model(self): """The :class:`BlackboxModel` underlying the environment""" return self._blackbox_model - def set_models(self, transition_model=None, reward_model=None, blackbox_model=None): + def set_models(self, transition_model=None, response_model=None, blackbox_model=None): """Re-assign the models to be the ones given.""" if transition_model is not None: self._transition_model = transition_model - if reward_model is not None: - self._reward_model = reward_model + if response_model is not None: + self._response_model = response_model if blackbox_model is not None: self._blackbox_model = blackbox_model @@ -538,17 +638,17 @@ cdef class Environment: factor when executing actions following an option's policy until reaching terminal condition. Returns: - float or tuple: reward as a result of `action` and state transition, if `execute` is True - (next_state, reward) if `execute` is False. + Response or tuple: response as a result of `action` and state transition, if `execute` is True + (next_state, response) if `execute` is False. """ - next_state, reward, _ = sample_explict_models(self.transition_model, None, self.reward_model, + next_state, response, _ = sample_explict_models(self.transition_model, None, self.response_model, self.state, action, discount_factor=discount_factor) if execute: self.apply_transition(next_state) - return reward + return response else: - return next_state, reward + return next_state, response def apply_transition(self, next_state): """ @@ -558,9 +658,9 @@ cdef class Environment: self._cur_state = next_state def execute(self, action, observation_model): - reward = self.state_transition(action, execute=True) + response = self.state_transition(action, execute=True) observation = self.provide_observation(observation_model, action) - return (observation, reward) + return (observation, response) def provide_observation(self, observation_model, action): """ @@ -652,21 +752,21 @@ cpdef sample_generative_model(Agent agent, State state, Action action, float dis else: result = sample_explict_models(agent.transition_model, agent.observation_model, - agent.reward_model, + agent.response_model, state, action, discount_factor) return result -cpdef sample_explict_models(TransitionModel T, ObservationModel O, RewardModel R, +cpdef sample_explict_models(TransitionModel T, ObservationModel O, ResponseModel R, State state, Action action, float discount_factor=1.0): """ - sample_explict_models(TransitionModel T, ObservationModel O, RewardModel R, State state, Action action, float discount_factor=1.0) + sample_explict_models(TransitionModel T, ObservationModel O, ResponseModel R, State state, Action action, float discount_factor=1.0) """ cdef State next_state cdef Observation observation - cdef float reward + cdef Response response = Response() cdef Option option cdef int nsteps = 0 @@ -682,17 +782,17 @@ cpdef sample_explict_models(TransitionModel T, ObservationModel O, RewardModel R # action will lead to no state change, no observation, and 0 reward, # because nothing happened. 
if O is not None: - return state, None, 0, 0 + return state, None, 0, response else: - return state, 0, 0 + return state, 0, response - reward = 0 + # response = 0 step_discount_factor = 1.0 while not option.termination(state): action = option.sample(state) next_state = T.sample(state, action) # For now, we don't care about intermediate observations (future work?). - reward += step_discount_factor * R.sample(state, action, next_state) + response = response + step_discount_factor * R.sample(state, action, next_state) step_discount_factor *= discount_factor state = next_state nsteps += 1 @@ -700,10 +800,10 @@ cpdef sample_explict_models(TransitionModel T, ObservationModel O, RewardModel R # (doesn't quite make sense to just use option as the action at this point.) else: next_state = T.sample(state, action) - reward = R.sample(state, action, next_state) + response = R.sample(state, action, next_state) nsteps += 1 if O is not None: observation = O.sample(next_state, action) - return next_state, observation, reward, nsteps + return next_state, observation, response, nsteps else: - return next_state, reward, nsteps + return next_state, response, nsteps diff --git a/tests/test_response.py b/tests/test_response.py new file mode 100644 index 00000000..f34da7c3 --- /dev/null +++ b/tests/test_response.py @@ -0,0 +1,41 @@ +from pomdp_py.framework.basics import Response, Vector + +description = "testing framework basics response" + + +def test_assign(): + r = Response() + assert r["reward"] == 0.0 + + r = Response({"reward": 34.0, "cost": Vector([12.0, 53.0])}) + assert r["reward"] == 34.0 + assert r["cost"] == [12.0, 53.0] + + +def test_add(): + r = Response() + r = r + Response({"reward": 42.0}) + assert r["reward"] == 42.0 + + r = Response({"reward": 42.0, "cost": Vector([4.0, 9.0])}) + r = r + Response({"reward": 2.0, "cost": Vector([1.0, 2.0])}) + assert r["reward"] == 44.0 + assert r["cost"] == Vector([5.0, 11.0]) + + +def test_multiply(): + r = Response({"reward": 1.0, "cost": Vector([3.5, 6.2, 9.1])}) + r = r * 1000.0 + assert r["reward"] == 1000.0 + assert r["cost"] == [3500.0, 6200.0, 9100.0] + + +def run(): + test_assign() + test_add() + test_multiply() + + +if __name__ == "__main__": + run() + \ No newline at end of file From 6058dbe0251711d96b574d7db2239976d0280aa2 Mon Sep 17 00:00:00 2001 From: Troi Williams <40696868+troiwill@users.noreply.github.com> Date: Sun, 24 Mar 2024 13:41:41 -0400 Subject: [PATCH 2/4] Updated pomdp-py problems to use response model. 
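
The porting pattern is the same for every problem touched below: the problem's existing RewardModel is wrapped in the new ResponseModel under the "reward" key, and call sites read the scalar reward back out of the returned Response. A rough before/after sketch of that call-site change (illustration only; `RewardModel`, `TransitionModel`, `init_state`, and `action` stand in for whichever problem is being ported and are not defined here):

    import pomdp_py

    # Before: the environment took a RewardModel directly.
    #   env = pomdp_py.Environment(init_state, TransitionModel(), RewardModel())
    # After: the RewardModel is wrapped in a ResponseModel keyed by "reward".
    env = pomdp_py.Environment(
        init_state,
        TransitionModel(),
        pomdp_py.ResponseModel({"reward": RewardModel()}),
    )

    # state_transition() now returns a Response (a dict subclass) rather than
    # a float, so the reward is read out by key.
    response = env.state_transition(action, execute=True)
    reward = response["reward"]

The same substitution applies to agents (the `reward_model=` argument becomes `response_model=`) and to PORollout callers, which now read `planner.last_best_response["reward"]` instead of `planner.last_best_reward`.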
--- pomdp_py/problems/load_unload/load_unload.py | 7 ++++--- .../problems/multi_object_search/agent/agent.py | 2 +- pomdp_py/problems/multi_object_search/env/env.py | 9 +++++---- .../problems/rocksample/rocksample_problem.py | 16 ++++++++-------- pomdp_py/problems/tag/agent/agent.py | 2 +- pomdp_py/problems/tag/env/env.py | 2 +- pomdp_py/problems/tag/problem.py | 8 ++++---- pomdp_py/problems/tiger/tiger_problem.py | 6 +++--- 8 files changed, 27 insertions(+), 25 deletions(-) diff --git a/pomdp_py/problems/load_unload/load_unload.py b/pomdp_py/problems/load_unload/load_unload.py index 197ea5db..a0071de9 100644 --- a/pomdp_py/problems/load_unload/load_unload.py +++ b/pomdp_py/problems/load_unload/load_unload.py @@ -220,10 +220,10 @@ def __init__(self, init_state, init_belief): LUPolicyModel(), LUTransitionModel(), LUObservationModel(), - LURewardModel(), + pomdp_py.ResponseModel({"reward": LURewardModel()}), ) - env = pomdp_py.Environment(init_state, LUTransitionModel(), LURewardModel()) + env = pomdp_py.Environment(init_state, LUTransitionModel(), pomdp_py.ResponseModel({"reward": LURewardModel()})) super().__init__(agent, env, name="LoadUnloadProblem") @@ -267,7 +267,8 @@ def update(t): print("==== Step %d ====" % (t + 1)) action = planner.plan(load_unload_problem.agent) - env_reward = load_unload_problem.env.state_transition(action, execute=True) + env_response = load_unload_problem.env.state_transition(action, execute=True) + env_reward = env_response["reward"] true_state = copy.deepcopy(load_unload_problem.env.state) real_observation = load_unload_problem.env.provide_observation( diff --git a/pomdp_py/problems/multi_object_search/agent/agent.py b/pomdp_py/problems/multi_object_search/agent/agent.py index b1525706..0082cd1d 100644 --- a/pomdp_py/problems/multi_object_search/agent/agent.py +++ b/pomdp_py/problems/multi_object_search/agent/agent.py @@ -60,7 +60,7 @@ def __init__( policy_model, transition_model=transition_model, observation_model=observation_model, - reward_model=reward_model, + response_model=pomdp_py.ResponseModel({"reward": reward_model}), ) def clear_history(self): diff --git a/pomdp_py/problems/multi_object_search/env/env.py b/pomdp_py/problems/multi_object_search/env/env.py index da4fce30..3b1ed7e7 100644 --- a/pomdp_py/problems/multi_object_search/env/env.py +++ b/pomdp_py/problems/multi_object_search/env/env.py @@ -33,7 +33,7 @@ def __init__(self, dim, init_state, sensors, obstacles=set({})): if not isinstance(init_state.object_states[objid], RobotState) } reward_model = GoalRewardModel(self.target_objects) - super().__init__(init_state, transition_model, reward_model) + super().__init__(init_state, transition_model, pomdp_py.ResponseModel({"reward": reward_model})) @property def robot_ids(self): @@ -52,8 +52,8 @@ def state_transition(self, action, execute=True, robot_id=None): become the current state. Returns: - float or tuple: reward as a result of `action` and state - transition, if `execute` is True (next_state, reward) if `execute` + Response or tuple: response as a result of `action` and state + transition, if `execute` is True (next_state, response) if `execute` is False. 
""" @@ -66,9 +66,10 @@ def state_transition(self, action, execute=True, robot_id=None): self.state, action ) - reward = self.reward_model.sample( + response = self.response_model.sample( self.state, action, next_state, robot_id=robot_id ) + reward = response["reward"] if execute: self.apply_transition(next_state) return reward diff --git a/pomdp_py/problems/rocksample/rocksample_problem.py b/pomdp_py/problems/rocksample/rocksample_problem.py index 2980af5a..fd62d064 100644 --- a/pomdp_py/problems/rocksample/rocksample_problem.py +++ b/pomdp_py/problems/rocksample/rocksample_problem.py @@ -439,12 +439,12 @@ def __init__( RSPolicyModel(n, k), RSTransitionModel(n, rock_locs, self.in_exit_area), RSObservationModel(rock_locs, half_efficiency_dist=half_efficiency_dist), - RSRewardModel(rock_locs, self.in_exit_area), + pomdp_py.ResponseModel({"reward": RSRewardModel(rock_locs, self.in_exit_area)}), ) env = pomdp_py.Environment( init_state, RSTransitionModel(n, rock_locs, self.in_exit_area), - RSRewardModel(rock_locs, self.in_exit_area), + pomdp_py.ResponseModel({"reward": RSRewardModel(rock_locs, self.in_exit_area)}), ) self._rock_locs = rock_locs super().__init__(agent, env, name="RockSampleProblem") @@ -461,7 +461,7 @@ def test_planner(rocksample, planner, nsteps=3, discount=0.95): # max_depth=5, anonymize=False) true_state = copy.deepcopy(rocksample.env.state) - env_reward = rocksample.env.state_transition(action, execute=True) + env_response = rocksample.env.state_transition(action, execute=True) true_next_state = copy.deepcopy(rocksample.env.state) real_observation = rocksample.env.provide_observation( @@ -469,20 +469,20 @@ def test_planner(rocksample, planner, nsteps=3, discount=0.95): ) rocksample.agent.update_history(action, real_observation) planner.update(rocksample.agent, action, real_observation) - total_reward += env_reward - total_discounted_reward += env_reward * gamma + total_reward += env_response["reward"] + total_discounted_reward += env_response["reward"] * gamma gamma *= discount print("True state: %s" % true_state) print("Action: %s" % str(action)) print("Observation: %s" % str(real_observation)) - print("Reward: %s" % str(env_reward)) + print("Reward: %s" % str(env_response["reward"])) print("Reward (Cumulative): %s" % str(total_reward)) print("Reward (Cumulative Discounted): %s" % str(total_discounted_reward)) if isinstance(planner, pomdp_py.POUCT): print("__num_sims__: %d" % planner.last_num_sims) print("__plan_time__: %.5f" % planner.last_planning_time) if isinstance(planner, pomdp_py.PORollout): - print("__best_reward__: %d" % planner.last_best_reward) + print("__best_reward__: %d" % planner.last_best_response["reward"]) print("World:") rocksample.print_state() @@ -537,7 +537,7 @@ def create_instance(n, k, **kwargs): def main(): - rocksample = debug_instance() # create_instance(7, 8) + rocksample = create_instance(7, 8) rocksample.print_state() print("*** Testing POMCP ***") diff --git a/pomdp_py/problems/tag/agent/agent.py b/pomdp_py/problems/tag/agent/agent.py index 1a166d9b..0932f092 100644 --- a/pomdp_py/problems/tag/agent/agent.py +++ b/pomdp_py/problems/tag/agent/agent.py @@ -118,7 +118,7 @@ def __init__(self, init_belief, grid_map, pr_stay=0.2, small=1, big=10): policy_model, transition_model=transition_model, observation_model=observation_model, - reward_model=reward_model, + response_model=pomdp_py.ResponseModel({"reward": reward_model}), ) def clear_history(self): diff --git a/pomdp_py/problems/tag/env/env.py b/pomdp_py/problems/tag/env/env.py index 
47211aff..c5462e0e 100644 --- a/pomdp_py/problems/tag/env/env.py +++ b/pomdp_py/problems/tag/env/env.py @@ -14,7 +14,7 @@ def __init__(self, init_state, grid_map, pr_stay=0.2, small=1, big=10): target_motion_policy = TagTargetMotionPolicy(grid_map, pr_stay) transition_model = TagTransitionModel(grid_map, target_motion_policy) reward_model = TagRewardModel(small=small, big=big) - super().__init__(init_state, transition_model, reward_model) + super().__init__(init_state, transition_model, pomdp_py.ResponseModel({"reward": reward_model})) @property def width(self): diff --git a/pomdp_py/problems/tag/problem.py b/pomdp_py/problems/tag/problem.py index a158af64..b6660dc2 100644 --- a/pomdp_py/problems/tag/problem.py +++ b/pomdp_py/problems/tag/problem.py @@ -87,7 +87,7 @@ def solve( break # no more time to update. # Execute action - reward = problem.env.state_transition(real_action, execute=True) + response = problem.env.state_transition(real_action, execute=True) # Receive observation _start = time.time() @@ -104,13 +104,13 @@ def solve( _time_used += time.time() - _start # Info and render - _total_reward += reward - _total_discounted_reward += reward * _discount + _total_reward += response["reward"] + _total_discounted_reward += response["reward"] * _discount _discount = _discount * discount_factor print("==== Step %d ====" % (i + 1)) print("Action: %s" % str(real_action)) print("Observation: %s" % str(real_observation)) - print("Reward: %s" % str(reward)) + print("Reward: %s" % str(response["reward"])) print("Reward (Cumulative): %s" % str(_total_reward)) print("Reward (Discounted): %s" % str(_total_discounted_reward)) print("Find Actions Count: %d" % _find_actions_count) diff --git a/pomdp_py/problems/tiger/tiger_problem.py b/pomdp_py/problems/tiger/tiger_problem.py index 67a378ba..965e6344 100644 --- a/pomdp_py/problems/tiger/tiger_problem.py +++ b/pomdp_py/problems/tiger/tiger_problem.py @@ -217,9 +217,9 @@ def __init__(self, obs_noise, init_true_state, init_belief): PolicyModel(), TransitionModel(), ObservationModel(obs_noise), - RewardModel(), + pomdp_py.ResponseModel({"reward": RewardModel()}), ) - env = pomdp_py.Environment(init_true_state, TransitionModel(), RewardModel()) + env = pomdp_py.Environment(init_true_state, TransitionModel(), pomdp_py.ResponseModel({"reward": RewardModel()})) super().__init__(agent, env, name="TigerProblem") @staticmethod @@ -273,7 +273,7 @@ def test_planner(tiger_problem, planner, nsteps=3, debug_tree=False): # in real world); In that case, you could skip # the state transition and re-estimate the state # (e.g. through the perception stack on the robot). - reward = tiger_problem.env.reward_model.sample( + reward = tiger_problem.env.response_model["reward"].sample( tiger_problem.env.state, action, None ) print("Reward:", reward) From 4cdcf60cc333501f19a71fa8541e01a8e8147897 Mon Sep 17 00:00:00 2001 From: Troi Williams <40696868+troiwill@users.noreply.github.com> Date: Sun, 24 Mar 2024 23:47:19 -0400 Subject: [PATCH 3/4] Simplified the response variable and the model. 
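
After this change, Response carries a single scalar reward exposed as an attribute, with arithmetic and comparison operators, and ResponseModel instances are built through the generate_response_model factory, which checks each model name against the attributes of the Response it will create. A rough sketch of the simplified API (illustration only; `my_reward_model`, `state`, `action`, and `next_state` are hypothetical placeholders, and the reward model only needs a `sample(state, action, next_state)` method):

    import pomdp_py
    from pomdp_py.framework.basics import Response

    response_model = pomdp_py.ResponseModel.generate_response_model(
        {"reward": my_reward_model}
    )

    response = response_model.sample(state, action, next_state)
    print(response.reward)                   # attribute access replaces response["reward"]

    # Responses support arithmetic and comparisons against Response, float,
    # or int, which is what the planners rely on when accumulating
    # discounted returns.
    total = response_model.create_response()     # Response(reward=0.0)
    total = total + response * 0.95
    assert Response(2.0) > Response(1.0) and Response(1.0) == 1.0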
--- pomdp_py/algorithms/po_rollout.pyx | 24 +-- pomdp_py/algorithms/po_uct.pyx | 8 +- pomdp_py/algorithms/value_iteration.pyx | 2 +- pomdp_py/framework/basics.pxd | 10 +- pomdp_py/framework/basics.pyx | 139 +++++++++++++----- pomdp_py/problems/load_unload/load_unload.py | 9 +- .../multi_object_search/agent/agent.py | 2 +- .../problems/multi_object_search/env/env.py | 4 +- .../problems/rocksample/rocksample_problem.py | 13 +- pomdp_py/problems/tag/agent/agent.py | 2 +- pomdp_py/problems/tag/env/env.py | 2 +- pomdp_py/problems/tag/problem.py | 6 +- pomdp_py/problems/tiger/tiger_problem.py | 9 +- tests/test_response.py | 25 ++-- 14 files changed, 162 insertions(+), 93 deletions(-) diff --git a/pomdp_py/algorithms/po_rollout.pyx b/pomdp_py/algorithms/po_rollout.pyx index 4e23f427..d8f3cbe5 100644 --- a/pomdp_py/algorithms/po_rollout.pyx +++ b/pomdp_py/algorithms/po_rollout.pyx @@ -46,7 +46,7 @@ cdef class PORollout(Planner): self._particles = particles self._agent = None - self._last_best_response = Response({"reward": float('-inf')}) + self._last_best_response = None @property def last_best_response(self): @@ -60,24 +60,24 @@ cdef class PORollout(Planner): cpdef _search(self): cdef Action best_action - cdef Response best_response = Response() - cdef Response response_avg = Response() - cdef Response total_discounted_response = Response() + cdef Response best_response + cdef Response response_avg + cdef Response total_discounted_response cdef set legal_actions cdef list responses - best_action, best_response["reward"] = None, float("-inf") + best_action, best_response = None, Response(float("-inf")) legal_actions = self._agent.valid_actions(history=self._agent.history) for action in legal_actions: responses = [] for i in range(self._num_sims // len(legal_actions)): state = self._agent.belief.random() total_discounted_response = self._rollout(state, 0) - responses.append(total_discounted_response["reward"]) - response_avg["reward"] = sum(responses) / len(responses) - if response_avg["reward"] > best_response["reward"]: + responses.append(total_discounted_response) + response_avg = sum(responses) / len(responses) + if response_avg > best_response: best_action = action - best_response["reward"] = response_avg["reward"] + best_response = response_avg return best_action, best_response cpdef _rollout(self, State state, int depth): @@ -87,7 +87,7 @@ cdef class PORollout(Planner): cdef Response total_discounted_response = Response() cdef State next_state cdef Observation observation - cdef Response response = Response() + cdef Response response cdef int nsteps cdef tuple history = self._agent.history @@ -130,8 +130,8 @@ cdef class PORollout(Planner): def clear_agent(self): """clear_agent(self)""" self._agent = None # forget about current agent so that can plan for another agent. 
- self._last_best_response["reward"] = float('-inf') - + self._last_best_response = Response(float('-inf')) + cpdef set_rollout_policy(self, RolloutPolicy rollout_policy): """ set_rollout_policy(self, RolloutPolicy rollout_policy) diff --git a/pomdp_py/algorithms/po_uct.pyx b/pomdp_py/algorithms/po_uct.pyx index 0cbc731c..838360c4 100644 --- a/pomdp_py/algorithms/po_uct.pyx +++ b/pomdp_py/algorithms/po_uct.pyx @@ -75,7 +75,7 @@ cdef class QNode(TreeNode): cpdef void update(QNode self, Response response): self.num_visits += 1 - self.value = self.value + (response["reward"] - self.value) / self.num_visits + self.value = self.value + (response.reward - self.value) / self.num_visits cdef class VNode(TreeNode): @@ -371,7 +371,7 @@ cdef class POUCT(Planner): State state, tuple history, VNode root, QNode parent, Observation observation, int depth): if depth > self._max_depth: - return Response() + return self._agent.response_model.create_response() if root is None: if self._agent.tree is None: root = self._VNode(root=True) @@ -409,10 +409,10 @@ cdef class POUCT(Planner): cpdef _rollout(self, State state, tuple history, VNode root, int depth): cdef Action action cdef float discount = 1.0 - cdef Response total_discounted_response = Response() + cdef Response total_discounted_response = self._agent.response_model.create_response() cdef State next_state cdef Observation observation - cdef Response response = Response() + cdef Response response while depth < self._max_depth: action = self._rollout_policy.rollout(state, history) diff --git a/pomdp_py/algorithms/value_iteration.pyx b/pomdp_py/algorithms/value_iteration.pyx index 331194b1..52a3f8ea 100644 --- a/pomdp_py/algorithms/value_iteration.pyx +++ b/pomdp_py/algorithms/value_iteration.pyx @@ -49,7 +49,7 @@ cdef class _PolicyTreeNode: else: subtree_value = 0.0 response = self._agent.response_model.sample(s, self.action, sp) - expected_future_value += trans_prob * obsrv_prob * (response["reward"] + discount_factor*subtree_value) + expected_future_value += trans_prob * obsrv_prob * (response.reward + discount_factor*subtree_value) values[s] = expected_future_value return values diff --git a/pomdp_py/framework/basics.pxd b/pomdp_py/framework/basics.pxd index 038de4cc..70f1a5ad 100644 --- a/pomdp_py/framework/basics.pxd +++ b/pomdp_py/framework/basics.pxd @@ -9,8 +9,10 @@ cdef class TransitionModel: cdef class PolicyModel: pass -cdef class ResponseModel(dict): - pass +cdef class ResponseModel: + cdef dict _model_dict + cdef Response _response + cdef dict __dict__ cdef class BlackboxModel: pass @@ -34,8 +36,8 @@ cdef class Observation: cdef class Vector(list): pass -cdef class Response(dict): - pass +cdef class Response: + cdef float _reward cdef class Agent: cdef GenerativeDistribution _init_belief diff --git a/pomdp_py/framework/basics.pyx b/pomdp_py/framework/basics.pyx index b8f24a75..c4cb0d62 100644 --- a/pomdp_py/framework/basics.pyx +++ b/pomdp_py/framework/basics.pyx @@ -186,19 +186,50 @@ cdef class RewardModel: Returns the underlying distribution of the model""" raise NotImplementedError -cdef class ResponseModel(dict): +cdef class ResponseModel: """A ResponseModel returns a real or simulated response after the agent interacts with the real or a simulated environment. 
The implementation of this model contains a collection of more specific models such as reward and cost models.""" - - def __init__(self, models): - if not isinstance(models, dict): - raise TypeError("models must be a dictionary of models.") - for key, model in models.items(): + def __init__(self, response): + self._model_dict = dict() + self._response = response + + @staticmethod + def generate_response_model(model_dict, response=Response()): + # Do a sanity check to ensure the response model and response are compatible. + for name in model_dict.keys(): + if not hasattr(response, name): + raise AttributeError(f"The response {type(response)} does not have the attribute {name}.") + + # Create the response model and add the models. + model = ResponseModel(response) + model.add_models(model_dict) + return model + + def add_attrs(self, attr_dict): + if not isinstance(attr_dict, dict): + raise TypeError(f"attr_dict must be type dict, but got {type(attr_dict)}.") + + for ak, av in attr_dict.items(): + if hasattr(self, ak): + raise KeyError(f"The attribute {ak} already exists.") + setattr(self, ak, None) + + def add_models(self, model_dict): + if not isinstance(model_dict, dict): + raise TypeError(f"model_dict must be type dict, but got {type(model_dict)}.") + + for model_name, model in model_dict.items(): + # Perform a sanity check. if not hasattr(model, "sample"): - raise NotImplementedError(f"Model named {key} must implement a sample function.") - self[key] = model + raise AttributeError(f"The model {model_name} does not have a sample(...) function.") + + # Store the model name for quick access in sample(...) function. + self._model_dict[model_name] = model + + # Add the models to the response model. + self.add_attrs(model_dict) def sample(self, state, action, next_state, **kwargs): """sample(self, state, action, next_state) @@ -212,12 +243,17 @@ cdef class ResponseModel(dict): Returns: Response: the response """ - return Response( - dict([ - (name, model.sample(state, action, next_state, **kwargs)) - for name, model in self.items() - ]) - ) + return self.create_response(**dict([ + (name, model.sample(state, action, next_state, **kwargs)) + for name, model in self._model_dict.items() + ])) + + def create_response(self, *args, **kwargs): + return self._response.new(*args, **kwargs) + + # @property + # def response_type(self): + # return type(self._response_type) cdef class BlackboxModel: """ @@ -371,31 +407,48 @@ cdef class Vector(list): raise TypeError(f"other must be type Vector, float, or int, but got {type(other)}.") return Vector([v0 + v1 for v0, v1 in zip(self, vec)]) + def __radd__(self, other): + return self.__add__(other) + def __mul__(self, other): if not isinstance(other, (float, int)): raise TypeError(f"other must be type float or int, but got {type(other)}.") return Vector([v * other for v in self]) -cdef class Response(dict): + def __rmul__(self, other): + return self.__mul__(other) + + +cdef class Response: """ - The Response class. + A Response class that only handles a scalar reward. Subclasses of Response can add + more (scalar or vector) variables. But the subclasses must implement how to handle + arithmetic and comparison operations. 
""" - def __init__(self, variables=dict(reward=0.0)): + def __init__(self, reward=0.0): super().__init__() - if not isinstance(variables, dict): - raise TypeError(f"reward must be type dict, but got {type(variables)}.") - for k, v in variables.items(): - self[k] = v + self._reward = reward + + @property + def reward(self): + return self._reward + + @classmethod + def new(cls, reward=0.0): + return cls(reward=reward) + + def _check_reward_compatibility(self, value): + if not isinstance(value, (float, int, Response)): + raise TypeError(f"other must be type Response, float, or int, but got {type(value)}.") + + def _get_value(self, value): + self._check_reward_compatibility(value) + if isinstance(value, Response): + value = value.reward + return value def __add__(self, other): - if not isinstance(other, Response): - raise TypeError("other must be type Response.") - return Response( - dict([ - (name, value + other[name]) - for name, value in self.items() - ]) - ) + return Response(self._reward + self._get_value(other)) def __radd__(self, other): return self.__add__(other) @@ -403,19 +456,31 @@ cdef class Response(dict): def __mul__(self, other): if not isinstance(other, (float, int)): raise TypeError("other must be type float or int.") - return Response( - dict([ - (name, value * other) - for name, value in self.items() - ]) - ) + return Response(self._reward * other) def __rmul__(self, other): return self.__mul__(other) - def __str__(self): - return ", ".join([f"{k}={v}" for k, v in self.items()]) + def __eq__(self, other): + return self._reward == self._get_value(other) + def __ne__(self, other): + return self._reward != self._get_value(other) + + def __lt__(self, other): + return self._reward < self._get_value(other) + + def __le__(self, other): + return self._reward <= self._get_value(other) + + def __gt__(self, other): + return self._reward > self._get_value(other) + + def __ge__(self, other): + return self._reward >= self._get_value(other) + + def __str__(self): + return f"reward={self._reward}" cdef class Agent: """ An Agent operates in an environment by taking actions, receiving diff --git a/pomdp_py/problems/load_unload/load_unload.py b/pomdp_py/problems/load_unload/load_unload.py index a0071de9..05c73823 100644 --- a/pomdp_py/problems/load_unload/load_unload.py +++ b/pomdp_py/problems/load_unload/load_unload.py @@ -215,15 +215,18 @@ def get_all_actions(self, **kwargs): class LoadUnloadProblem(pomdp_py.POMDP): def __init__(self, init_state, init_belief): """init_belief is a Distribution.""" + import copy + + response_model = pomdp_py.ResponseModel.generate_response_model({"reward": LURewardModel()}) agent = pomdp_py.Agent( init_belief, LUPolicyModel(), LUTransitionModel(), LUObservationModel(), - pomdp_py.ResponseModel({"reward": LURewardModel()}), + copy.deepcopy(response_model), ) - env = pomdp_py.Environment(init_state, LUTransitionModel(), pomdp_py.ResponseModel({"reward": LURewardModel()})) + env = pomdp_py.Environment(init_state, LUTransitionModel(), copy.deepcopy(response_model)) super().__init__(agent, env, name="LoadUnloadProblem") @@ -268,7 +271,7 @@ def update(t): action = planner.plan(load_unload_problem.agent) env_response = load_unload_problem.env.state_transition(action, execute=True) - env_reward = env_response["reward"] + env_reward = env_response.reward true_state = copy.deepcopy(load_unload_problem.env.state) real_observation = load_unload_problem.env.provide_observation( diff --git a/pomdp_py/problems/multi_object_search/agent/agent.py 
b/pomdp_py/problems/multi_object_search/agent/agent.py index 0082cd1d..f2da2d3a 100644 --- a/pomdp_py/problems/multi_object_search/agent/agent.py +++ b/pomdp_py/problems/multi_object_search/agent/agent.py @@ -60,7 +60,7 @@ def __init__( policy_model, transition_model=transition_model, observation_model=observation_model, - response_model=pomdp_py.ResponseModel({"reward": reward_model}), + response_model=pomdp_py.ResponseModel.generate_response_model(dict(reward=reward_model)), ) def clear_history(self): diff --git a/pomdp_py/problems/multi_object_search/env/env.py b/pomdp_py/problems/multi_object_search/env/env.py index 3b1ed7e7..04ce9563 100644 --- a/pomdp_py/problems/multi_object_search/env/env.py +++ b/pomdp_py/problems/multi_object_search/env/env.py @@ -33,7 +33,7 @@ def __init__(self, dim, init_state, sensors, obstacles=set({})): if not isinstance(init_state.object_states[objid], RobotState) } reward_model = GoalRewardModel(self.target_objects) - super().__init__(init_state, transition_model, pomdp_py.ResponseModel({"reward": reward_model})) + super().__init__(init_state, transition_model, pomdp_py.ResponseModel.generate_response_model(dict(reward=reward_model))) @property def robot_ids(self): @@ -69,7 +69,7 @@ def state_transition(self, action, execute=True, robot_id=None): response = self.response_model.sample( self.state, action, next_state, robot_id=robot_id ) - reward = response["reward"] + reward = response.reward if execute: self.apply_transition(next_state) return reward diff --git a/pomdp_py/problems/rocksample/rocksample_problem.py b/pomdp_py/problems/rocksample/rocksample_problem.py index fd62d064..5485c3a5 100644 --- a/pomdp_py/problems/rocksample/rocksample_problem.py +++ b/pomdp_py/problems/rocksample/rocksample_problem.py @@ -434,17 +434,18 @@ def __init__( self, n, k, init_state, rock_locs, init_belief, half_efficiency_dist=20 ): self._n, self._k = n, k + reponse_model = pomdp_py.ResponseModel.generate_response_model(dict(reward=RSRewardModel(rock_locs, self.in_exit_area))) agent = pomdp_py.Agent( init_belief, RSPolicyModel(n, k), RSTransitionModel(n, rock_locs, self.in_exit_area), RSObservationModel(rock_locs, half_efficiency_dist=half_efficiency_dist), - pomdp_py.ResponseModel({"reward": RSRewardModel(rock_locs, self.in_exit_area)}), + copy.deepcopy(reponse_model), ) env = pomdp_py.Environment( init_state, RSTransitionModel(n, rock_locs, self.in_exit_area), - pomdp_py.ResponseModel({"reward": RSRewardModel(rock_locs, self.in_exit_area)}), + copy.deepcopy(reponse_model), ) self._rock_locs = rock_locs super().__init__(agent, env, name="RockSampleProblem") @@ -469,20 +470,20 @@ def test_planner(rocksample, planner, nsteps=3, discount=0.95): ) rocksample.agent.update_history(action, real_observation) planner.update(rocksample.agent, action, real_observation) - total_reward += env_response["reward"] - total_discounted_reward += env_response["reward"] * gamma + total_reward += env_response.reward + total_discounted_reward += env_response.reward * gamma gamma *= discount print("True state: %s" % true_state) print("Action: %s" % str(action)) print("Observation: %s" % str(real_observation)) - print("Reward: %s" % str(env_response["reward"])) + print("Reward: %s" % str(env_response.reward)) print("Reward (Cumulative): %s" % str(total_reward)) print("Reward (Cumulative Discounted): %s" % str(total_discounted_reward)) if isinstance(planner, pomdp_py.POUCT): print("__num_sims__: %d" % planner.last_num_sims) print("__plan_time__: %.5f" % planner.last_planning_time) if 
isinstance(planner, pomdp_py.PORollout): - print("__best_reward__: %d" % planner.last_best_response["reward"]) + print("__best_reward__: %d" % planner.last_best_response.reward) print("World:") rocksample.print_state() diff --git a/pomdp_py/problems/tag/agent/agent.py b/pomdp_py/problems/tag/agent/agent.py index 0932f092..47fbce70 100644 --- a/pomdp_py/problems/tag/agent/agent.py +++ b/pomdp_py/problems/tag/agent/agent.py @@ -118,7 +118,7 @@ def __init__(self, init_belief, grid_map, pr_stay=0.2, small=1, big=10): policy_model, transition_model=transition_model, observation_model=observation_model, - response_model=pomdp_py.ResponseModel({"reward": reward_model}), + response_model=pomdp_py.ResponseModel.generate_response_model({"reward": reward_model}), ) def clear_history(self): diff --git a/pomdp_py/problems/tag/env/env.py b/pomdp_py/problems/tag/env/env.py index c5462e0e..f6a69e0b 100644 --- a/pomdp_py/problems/tag/env/env.py +++ b/pomdp_py/problems/tag/env/env.py @@ -14,7 +14,7 @@ def __init__(self, init_state, grid_map, pr_stay=0.2, small=1, big=10): target_motion_policy = TagTargetMotionPolicy(grid_map, pr_stay) transition_model = TagTransitionModel(grid_map, target_motion_policy) reward_model = TagRewardModel(small=small, big=big) - super().__init__(init_state, transition_model, pomdp_py.ResponseModel({"reward": reward_model})) + super().__init__(init_state, transition_model, pomdp_py.ResponseModel.generate_response_model({"reward": reward_model})) @property def width(self): diff --git a/pomdp_py/problems/tag/problem.py b/pomdp_py/problems/tag/problem.py index b6660dc2..9172ffba 100644 --- a/pomdp_py/problems/tag/problem.py +++ b/pomdp_py/problems/tag/problem.py @@ -104,13 +104,13 @@ def solve( _time_used += time.time() - _start # Info and render - _total_reward += response["reward"] - _total_discounted_reward += response["reward"] * _discount + _total_reward += response.reward + _total_discounted_reward += response.reward * _discount _discount = _discount * discount_factor print("==== Step %d ====" % (i + 1)) print("Action: %s" % str(real_action)) print("Observation: %s" % str(real_observation)) - print("Reward: %s" % str(response["reward"])) + print("Reward: %s" % str(response.reward)) print("Reward (Cumulative): %s" % str(_total_reward)) print("Reward (Discounted): %s" % str(_total_discounted_reward)) print("Find Actions Count: %d" % _find_actions_count) diff --git a/pomdp_py/problems/tiger/tiger_problem.py b/pomdp_py/problems/tiger/tiger_problem.py index 965e6344..777529f4 100644 --- a/pomdp_py/problems/tiger/tiger_problem.py +++ b/pomdp_py/problems/tiger/tiger_problem.py @@ -212,14 +212,15 @@ class TigerProblem(pomdp_py.POMDP): def __init__(self, obs_noise, init_true_state, init_belief): """init_belief is a Distribution.""" + response_model = pomdp_py.ResponseModel.generate_response_model(dict(reward=RewardModel())) agent = pomdp_py.Agent( init_belief, PolicyModel(), TransitionModel(), ObservationModel(obs_noise), - pomdp_py.ResponseModel({"reward": RewardModel()}), + copy.deepcopy(response_model), ) - env = pomdp_py.Environment(init_true_state, TransitionModel(), pomdp_py.ResponseModel({"reward": RewardModel()})) + env = pomdp_py.Environment(init_true_state, TransitionModel(), copy.deepcopy(response_model)) super().__init__(agent, env, name="TigerProblem") @staticmethod @@ -273,10 +274,10 @@ def test_planner(tiger_problem, planner, nsteps=3, debug_tree=False): # in real world); In that case, you could skip # the state transition and re-estimate the state # (e.g. 
through the perception stack on the robot). - reward = tiger_problem.env.response_model["reward"].sample( + response = tiger_problem.env.response_model.sample( tiger_problem.env.state, action, None ) - print("Reward:", reward) + print("Reward:", response.reward) # Let's create some simulated real observation; # Here, we use observation based on true state for sanity diff --git a/tests/test_response.py b/tests/test_response.py index f34da7c3..fe4a5f1f 100644 --- a/tests/test_response.py +++ b/tests/test_response.py @@ -1,33 +1,30 @@ -from pomdp_py.framework.basics import Response, Vector +from pomdp_py.framework.basics import Response description = "testing framework basics response" def test_assign(): r = Response() - assert r["reward"] == 0.0 + assert r.reward == 0.0 - r = Response({"reward": 34.0, "cost": Vector([12.0, 53.0])}) - assert r["reward"] == 34.0 - assert r["cost"] == [12.0, 53.0] + r = Response(34.0) + assert r.reward == 34.0 def test_add(): r = Response() - r = r + Response({"reward": 42.0}) - assert r["reward"] == 42.0 + r = r + Response(42.0) + assert r.reward == 42.0 - r = Response({"reward": 42.0, "cost": Vector([4.0, 9.0])}) - r = r + Response({"reward": 2.0, "cost": Vector([1.0, 2.0])}) - assert r["reward"] == 44.0 - assert r["cost"] == Vector([5.0, 11.0]) + r = Response() + r = r + 61.0 + assert r.reward == 61.0 def test_multiply(): - r = Response({"reward": 1.0, "cost": Vector([3.5, 6.2, 9.1])}) + r = Response(1.0) r = r * 1000.0 - assert r["reward"] == 1000.0 - assert r["cost"] == [3500.0, 6200.0, 9100.0] + assert r.reward == 1000.0 def run(): From 95efc4743e2f5f93e704f925b0d195939abc411c Mon Sep 17 00:00:00 2001 From: Troi Williams <40696868+troiwill@users.noreply.github.com> Date: Mon, 25 Mar 2024 00:05:08 -0400 Subject: [PATCH 4/4] Added comments to the code. --- pomdp_py/framework/basics.pyx | 36 +++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/pomdp_py/framework/basics.pyx b/pomdp_py/framework/basics.pyx index c4cb0d62..7da4af41 100644 --- a/pomdp_py/framework/basics.pyx +++ b/pomdp_py/framework/basics.pyx @@ -197,6 +197,17 @@ cdef class ResponseModel: @staticmethod def generate_response_model(model_dict, response=Response()): + """ + Generate a response model based on a dictionary of model attributes. This is a + convenience method to make it easier to build a Response model. + + Args: + model_dict (dict): A dictionary of models in the form {model_type: model} (e.g., {reward: reward_model}) + response (Response): A response that will be used to generate new responses. + + Returns: + The response model. + """ # Do a sanity check to ensure the response model and response are compatible. for name in model_dict.keys(): if not hasattr(response, name): @@ -208,6 +219,12 @@ cdef class ResponseModel: return model def add_attrs(self, attr_dict): + """ + Adds attributes to this object dynamically. + + Args: + attr_dict: A dictionary of attribute names and values. + """ if not isinstance(attr_dict, dict): raise TypeError(f"attr_dict must be type dict, but got {type(attr_dict)}.") @@ -217,6 +234,12 @@ cdef class ResponseModel: setattr(self, ak, None) def add_models(self, model_dict): + """ + Add models to the response. + + Args: + model_dict: A dictionary of models in the form {model_type: model} (e.g., {reward: reward_model}). 
+ """ if not isinstance(model_dict, dict): raise TypeError(f"model_dict must be type dict, but got {type(model_dict)}.") @@ -249,12 +272,14 @@ cdef class ResponseModel: ])) def create_response(self, *args, **kwargs): + """ + Create a response with the given arguments. + + Returns: + An instance of : class : ` Response ` with the given parameters. + """ return self._response.new(*args, **kwargs) - # @property - # def response_type(self): - # return type(self._response_type) - cdef class BlackboxModel: """ A BlackboxModel is the generative distribution :math:`G(s,a)` @@ -387,6 +412,9 @@ cdef class Observation: return not self.__eq__(other) cdef class Vector(list): + """ + The Vector class. Provides an implementation of a vector for multi-valued response models. + """ def __init__(self, values=list()): if not isinstance(values, list): raise TypeError(f"values must be type list, but got {type(values)}.")