# self_play.py

import copy
import math
import time

import numpy
import ray
import torch

import models


@ray.remote(num_cpus=1)
class SelfPlay:
    """
    Ray actor which plays games with the current network and either saves them
    to the replay buffer (training) or reports their statistics (test mode).
    """

    def __init__(self, initial_weights, game, config):
        """
        Args:
            initial_weights: weights loaded into the freshly built network.
            game: game instance the worker plays on (reset/step/legal_actions/...).
            config: configuration object read throughout the worker.
        """
        self.config = config
        self.game = game

        # Running totals, used for the average reward reported in test mode
        self.sum_reward = 0
        self.num_game_played = 0

        # Fix random generator seed
        numpy.random.seed(self.config.seed)
        torch.manual_seed(self.config.seed)

        # Initialize the network
        self.model = models.MuZeroNetwork(self.config)
        self.model.set_weights(initial_weights)
        self.model.to(torch.device("cpu"))
        self.model.eval()

    def continuous_self_play(self, shared_storage, replay_buffer, test_mode=False):
        """
        Play games in an endless loop, reloading the target network weights from
        the shared storage before each game.

        In test mode the game is played greedily (temperature 0) against a random
        opponent and only statistics are pushed to the shared storage. Otherwise
        the game is saved to the replay buffer and the worker is throttled
        according to config.self_play_delay and config.ratio.

        Args:
            shared_storage: remote handle exposing get_target_network_weights,
                get_infos, set_infos and update_infos.
            replay_buffer: remote handle exposing save_game.
            test_mode: report statistics instead of saving games when True.
        """
        while True:
            self.model.set_weights(
                copy.deepcopy(
                    ray.get(shared_storage.get_target_network_weights.remote())
                )
            )

            if test_mode:
                # Take the best action (no exploration) in test mode
                temperature = 0
            else:
                temperature = self.config.visit_softmax_temperature_fn(
                    trained_steps=ray.get(shared_storage.get_infos.remote())[
                        "training_step"
                    ]
                )

            game_history = self.play_game(
                temperature,
                self.config.temperature_threshold,
                False,
                "random" if test_mode else "self",
                0,
            )

            self.num_game_played += 1

            if test_mode:
                # Nothing is saved or throttled in test mode
                self._report_test_game(shared_storage, game_history)
                continue

            # Save to the replay buffer
            replay_buffer.save_game.remote(game_history)
            shared_storage.update_infos.remote(
                "samples_count", len(game_history.priorities)
            )

            # Managing the self-play / training ratio
            if self.config.self_play_delay:
                time.sleep(self.config.self_play_delay)
            if self.config.ratio:
                while (
                    self._samples_per_training_step(shared_storage)
                    > self.config.ratio
                ):
                    time.sleep(0.5)

    def _report_test_game(self, shared_storage, game_history):
        """
        Push the statistics of one finished test game to the shared storage.
        """
        total_reward = sum(game_history.reward_history)
        shared_storage.set_infos.remote("total_reward", total_reward)
        self.sum_reward += total_reward
        shared_storage.set_infos.remote(
            "average_reward", self.sum_reward / self.num_game_played
        )
        shared_storage.set_infos.remote("test_games", self.num_game_played)
        shared_storage.set_infos.remote(
            "episode_length", len(game_history.action_history)
        )
        if 1 < len(self.config.players):
            # NOTE(review): "player_0_reward" is filtered on to_play == 1 (and
            # "player_1_reward" on to_play == 0); presumably because the reward
            # stored at index i was earned by the player who moved just before.
            # Confirm against the game implementation.
            shared_storage.set_infos.remote(
                "player_0_reward",
                sum(
                    reward
                    for i, reward in enumerate(game_history.reward_history)
                    if game_history.to_play_history[i] == 1
                ),
            )
            shared_storage.set_infos.remote(
                "player_1_reward",
                sum(
                    reward
                    for i, reward in enumerate(game_history.reward_history)
                    if game_history.to_play_history[i] == 0
                ),
            )

    @staticmethod
    def _samples_per_training_step(shared_storage):
        """
        Return samples_count / max(1, training_step), both read from a single
        snapshot of the shared infos so the two counters are consistent.
        """
        infos = ray.get(shared_storage.get_infos.remote())
        return infos["samples_count"] / max(1, infos["training_step"])

    def play_game(
        self, temperature, temperature_threshold, render, opponent, muzero_player
    ):
        """
        Play one game with actions based on the Monte Carlo tree search at each move.

        Args:
            temperature: softmax temperature used to sample actions from the
                visit counts; 0 also disables the root exploration noise.
            temperature_threshold: once the action history reaches this length
                the temperature is forced to 0; a falsy value disables this.
            render: render the game and print search information when True.
            opponent: "self", "human" or "random"; who plays the turns which are
                not muzero_player's.
            muzero_player: player id whose moves are always chosen by the search.

        Returns:
            The GameHistory filled during the game.

        Raises:
            ValueError: if opponent is not one of the accepted strings.
        """
        game_history = GameHistory()
        observation = self.game.reset()
        # Index 0 of every history holds the initial position (dummy action/reward)
        game_history.action_history.append(0)
        game_history.observation_history.append(observation)
        game_history.reward_history.append(0)
        game_history.to_play_history.append(self.game.to_play())
        game_history.legal_actions.append(self.game.legal_actions())

        done = False

        if render:
            self.game.render()

        with torch.no_grad():
            while (
                not done and len(game_history.action_history) <= self.config.max_moves
            ):
                stacked_observations = game_history.get_stacked_observations(
                    -1, self.config.stacked_observations,
                )

                # Exploration noise is only added when actions are sampled
                root, priority, tree_depth = MCTS(self.config).run(
                    self.model,
                    stacked_observations,
                    self.game.legal_actions(),
                    self.game.to_play(),
                    temperature != 0,
                )

                if render:
                    print("Tree depth: {}".format(tree_depth))
                    print(
                        "Root value for player {0}: {1:.2f}".format(
                            self.game.to_play(), root.value()
                        )
                    )

                # Choose the action
                if opponent == "self" or muzero_player == self.game.to_play():
                    action = self.select_action(
                        root,
                        temperature
                        if not temperature_threshold
                        or len(game_history.action_history) < temperature_threshold
                        else 0,
                    )
                elif opponent == "human":
                    print(
                        "Player {} turn. MuZero suggests {}".format(
                            self.game.to_play(),
                            self.game.action_to_string(self.select_action(root, 0)),
                        )
                    )
                    action = self.game.human_to_action()
                elif opponent == "random":
                    action = numpy.random.choice(self.game.legal_actions())
                else:
                    raise ValueError(
                        'Wrong argument: "opponent" argument should be "self", "human" or "random"'
                    )

                observation, reward, done = self.game.step(action)

                if render:
                    print(
                        "Played action: {}".format(self.game.action_to_string(action))
                    )
                    self.game.render()

                game_history.store_search_statistics(root, self.config.action_space)
                game_history.priorities.append(priority)

                # Next batch
                game_history.action_history.append(action)
                game_history.observation_history.append(observation)
                game_history.reward_history.append(reward)
                game_history.to_play_history.append(self.game.to_play())
                game_history.legal_actions.append(self.game.legal_actions())
        self.game.close()
        return game_history

    @staticmethod
    def select_action(node, temperature):
        """
        Select action according to the visit count distribution and the temperature.
        The temperature is changed dynamically with the visit_softmax_temperature
        function in the config.

        A temperature of 0 picks the most visited child, an infinite temperature
        picks uniformly among the children, anything else samples proportionally
        to visit_count ** (1 / temperature).
        """
        visit_counts = numpy.array(
            [child.visit_count for child in node.children.values()]
        )
        actions = list(node.children)
        if temperature == 0:
            action = actions[numpy.argmax(visit_counts)]
        elif temperature == float("inf"):
            action = numpy.random.choice(actions)
        else:
            # See paper appendix Data Generation
            visit_count_distribution = visit_counts ** (1 / temperature)
            visit_count_distribution = visit_count_distribution / sum(
                visit_count_distribution
            )
            action = numpy.random.choice(actions, p=visit_count_distribution)

        return action


# Game independent
class MCTS:
    """
    Monte Carlo Tree Search over the learned model.
    Each simulation starts at the root, descends by always taking the child with
    the best UCB score until it reaches a node without children, expands that
    node with the network and propagates the predicted value back to the root.
    """

    def __init__(self, config):
        self.config = config

    def run(self, model, observation, legal_actions, to_play, add_exploration_noise):
        """
        Build a search tree for the given observation.

        The root is expanded from model.initial_inference on the observation;
        every further node is expanded from model.recurrent_inference applied to
        its parent's hidden state and the action leading to it.

        Returns:
            (root, priority, max_tree_depth) where priority is None when
            config.use_max_priority is set, and otherwise
            |predicted root value - searched root value| ** config.PER_alpha.
        """
        config = self.config

        root = Node(0)
        batched_observation = torch.tensor(observation).float().unsqueeze(0)
        batched_observation = batched_observation.to(next(model.parameters()).device)
        initial_output = model.initial_inference(batched_observation)
        root_predicted_value, reward, policy_logits, hidden_state = initial_output
        root_predicted_value = models.support_to_scalar(
            root_predicted_value, config.support_size
        ).item()
        reward = models.support_to_scalar(reward, config.support_size).item()
        root.expand(legal_actions, to_play, reward, policy_logits, hidden_state)
        if add_exploration_noise:
            root.add_exploration_noise(
                dirichlet_alpha=config.root_dirichlet_alpha,
                exploration_fraction=config.root_exploration_fraction,
            )

        min_max_stats = MinMaxStats()
        max_tree_depth = 0

        for _ in range(config.num_simulations):
            node = root
            search_path = [root]
            virtual_to_play = to_play

            # Descend until a node which has no children yet
            while node.expanded():
                action, node = self.select_child(node, min_max_stats)
                search_path.append(node)

                # Players play turn by turn
                following = virtual_to_play + 1
                players = config.players
                virtual_to_play = (
                    players[following] if following < len(players) else players[0]
                )

            # One step down per node appended after the root
            current_tree_depth = len(search_path) - 1

            # Inside the search tree the dynamics function gives the next hidden
            # state from the parent's hidden state and the chosen action
            parent = search_path[-2]
            action_tensor = torch.tensor([[action]]).to(parent.hidden_state.device)
            value, reward, policy_logits, hidden_state = model.recurrent_inference(
                parent.hidden_state, action_tensor
            )
            value = models.support_to_scalar(value, config.support_size).item()
            reward = models.support_to_scalar(reward, config.support_size).item()
            node.expand(
                config.action_space, virtual_to_play, reward, policy_logits, hidden_state
            )

            self.backpropagate(search_path, value, virtual_to_play, min_max_stats)

            if current_tree_depth > max_tree_depth:
                max_tree_depth = current_tree_depth

        if config.use_max_priority:
            priority = None
        else:
            priority = (
                numpy.abs(root_predicted_value - root.value()) ** config.PER_alpha
            )

        return root, priority, max_tree_depth

    def select_child(self, node, min_max_stats):
        """
        Return (action, child) for the child of node with the highest UCB score.
        Candidates are compared as (score, action, child) tuples.
        """
        candidates = [
            (self.ucb_score(node, child, min_max_stats), action, child)
            for action, child in node.children.items()
        ]
        best = max(candidates)
        return best[1], best[2]

    def ucb_score(self, parent, child, min_max_stats):
        """
        Score of a child: exploration bonus weighted by its prior, plus its
        normalized value once it has been visited at least once.
        """
        base = self.config.pb_c_base
        exploration = math.log((parent.visit_count + base + 1) / base)
        exploration = exploration + self.config.pb_c_init
        exploration = exploration * (
            math.sqrt(parent.visit_count) / (child.visit_count + 1)
        )
        prior_score = exploration * child.prior

        value_score = 0
        if child.visit_count > 0:
            # Mean value Q
            value_score = min_max_stats.normalize(
                child.reward + self.config.discount * child.value()
            )

        return prior_score + value_score

    def backpropagate(self, search_path, value, to_play, min_max_stats):
        """
        Walk the search path from the leaf back to the root, adding the value to
        nodes of the player to_play and subtracting it from the other nodes,
        then discounting it through each node's reward.
        """
        discount = self.config.discount
        for node in reversed(search_path):
            if node.to_play == to_play:
                node.value_sum += value
            else:
                node.value_sum += -value
            node.visit_count += 1
            min_max_stats.update(node.reward + discount * node.value())

            value = node.reward + discount * value


class Node:
    """
    A node of the search tree.

    Holds the visit statistics (visit_count, value_sum), the prior given by the
    parent's policy, the player to move, the predicted reward, the hidden state
    and the children keyed by action.
    """

    def __init__(self, prior):
        self.visit_count = 0
        self.to_play = -1  # placeholder until the node is expanded
        self.prior = prior
        self.value_sum = 0
        self.children = {}
        self.hidden_state = None
        self.reward = 0

    def expanded(self):
        """Return True once the node has at least one child."""
        return len(self.children) > 0

    def value(self):
        """Return the mean backed-up value, or 0 for a node never visited."""
        if self.visit_count == 0:
            return 0
        return self.value_sum / self.visit_count

    def expand(self, actions, to_play, reward, policy_logits, hidden_state):
        """
        We expand a node using the value, reward and policy prediction obtained from the
        neural network.

        One child is created per action in `actions`. Its prior is the softmax
        probability of that action over ALL the logits in policy_logits[0] (the
        priors are not renormalized over `actions`), stored as a Python float.
        """
        self.to_play = to_play
        self.reward = reward
        self.hidden_state = hidden_state
        # A single numerically stable softmax instead of recomputing the
        # normalization for every action
        priors = torch.softmax(policy_logits[0], dim=0).tolist()
        for action in actions:
            self.children[action] = Node(priors[action])

    def add_exploration_noise(self, dirichlet_alpha, exploration_fraction):
        """
        At the start of each search, we add dirichlet noise to the prior of the root to
        encourage the search to explore new actions.

        Each child's prior becomes prior * (1 - fraction) + noise * fraction.
        """
        actions = list(self.children.keys())
        noise = numpy.random.dirichlet([dirichlet_alpha] * len(actions))
        frac = exploration_fraction
        for a, n in zip(actions, noise):
            self.children[a].prior = self.children[a].prior * (1 - frac) + n * frac


class GameHistory:
    """
    Keep only the useful information of a self-play game, as parallel lists
    with one entry per recorded step.
    """

    def __init__(self):
        self.observation_history = []
        self.action_history = []
        self.reward_history = []
        self.to_play_history = []
        self.child_visits = []
        self.root_values = []
        self.priorities = []
        self.legal_actions = []

    def store_search_statistics(self, root, action_space, idx=None):
        """
        Record the search result of a root: its children's visit counts turned
        into fractions over action_space (0 for actions without a child), and
        its value. Appended when idx is None, otherwise written at position idx.
        """
        total_visits = sum(child.visit_count for child in root.children.values())
        visit_fractions = []
        for a in action_space:
            if a in root.children:
                visit_fractions.append(root.children[a].visit_count / total_visits)
            else:
                visit_fractions.append(0)

        if idx is not None:
            self.child_visits[idx] = visit_fractions
            self.root_values[idx] = root.value()
            return

        self.child_visits.append(visit_fractions)
        self.root_values.append(root.value())

    def get_stacked_observations(self, index, num_stacked_observations):
        """
        Return the observation at position index followed by the
        num_stacked_observations previous observations, most recent first, each
        one followed by a plane filled with the action played after it.
        Steps before the start of the game are filled with zeros.
        """
        # Convert to positive index
        index = index % len(self.observation_history)

        stacked = self.observation_history[index].copy()
        oldest = index - num_stacked_observations
        for past_index in range(index - 1, oldest - 1, -1):
            if past_index >= 0:
                past_frame = self.observation_history[past_index]
                action_plane = (
                    numpy.ones_like(stacked[0]) * self.action_history[past_index + 1]
                )
            else:
                # Before the first step: empty observation and empty action plane
                past_frame = numpy.zeros_like(self.observation_history[index])
                action_plane = numpy.zeros_like(stacked[0])

            previous_observation = numpy.concatenate((past_frame, [action_plane]))
            stacked = numpy.concatenate((stacked, previous_observation))

        return stacked


class MinMaxStats:
    """
    Track the smallest and largest values seen so far and rescale values
    relative to them.
    """

    def __init__(self):
        self.maximum = -float("inf")
        self.minimum = float("inf")

    def update(self, value):
        """Widen the tracked range so that it contains value."""
        if value > self.maximum:
            self.maximum = value
        if value < self.minimum:
            self.minimum = value

    def normalize(self, value):
        """
        Map value linearly so that minimum -> 0 and maximum -> 1. The value is
        returned untouched while the maximum is not above the minimum.
        """
        if not self.maximum > self.minimum:
            return value
        span = self.maximum - self.minimum
        return (value - self.minimum) / span