From 03cf55f55daeab350e5b29ca240a0865b43fc789 Mon Sep 17 00:00:00 2001
From: Marc Lanctot
Date: Wed, 15 Nov 2023 15:55:28 +0000
Subject: [PATCH] Remove PyTorch NFSP due to indications that it is buggy and a lack of maintenance.

Resolves: #1008.
PiperOrigin-RevId: 582676107
Change-Id: I622a815880c18a535f8bec83311355f2ba025c8b
---
 open_spiel/python/CMakeLists.txt        |   1 -
 open_spiel/python/pytorch/nfsp.py       | 336 ------------------
 .../python/pytorch/nfsp_pytorch_test.py |  92 -----
 3 files changed, 429 deletions(-)
 delete mode 100644 open_spiel/python/pytorch/nfsp.py
 delete mode 100644 open_spiel/python/pytorch/nfsp_pytorch_test.py

diff --git a/open_spiel/python/CMakeLists.txt b/open_spiel/python/CMakeLists.txt
index 85e5e65fea..74d595e687 100644
--- a/open_spiel/python/CMakeLists.txt
+++ b/open_spiel/python/CMakeLists.txt
@@ -280,7 +280,6 @@ if (OPEN_SPIEL_ENABLE_PYTORCH)
   set(PYTHON_TESTS ${PYTHON_TESTS}
     pytorch/rcfr_pytorch_test.py
     pytorch/dqn_pytorch_test.py
-    pytorch/nfsp_pytorch_test.py
     pytorch/deep_cfr_pytorch_test.py
     pytorch/eva_pytorch_test.py
     pytorch/losses/rl_losses_pytorch_test.py
diff --git a/open_spiel/python/pytorch/nfsp.py b/open_spiel/python/pytorch/nfsp.py
deleted file mode 100644
index 68490da6d7..0000000000
--- a/open_spiel/python/pytorch/nfsp.py
+++ /dev/null
@@ -1,336 +0,0 @@
-# Copyright 2019 DeepMind Technologies Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Neural Fictitious Self-Play (NFSP) agent implemented in PyTorch.
-
-See the paper https://arxiv.org/abs/1603.01121 for more details.
-"""
-
-import collections
-import contextlib
-import enum
-import os
-import random
-from absl import logging
-import numpy as np
-
-import torch
-import torch.nn.functional as F
-
-from open_spiel.python import rl_agent
-from open_spiel.python.pytorch import dqn
-
-
-Transition = collections.namedtuple(
-    "Transition", "info_state action_probs legal_actions_mask")
-
-MODE = enum.Enum("mode", "best_response average_policy")
-
-
-class NFSP(rl_agent.AbstractAgent):
-  """NFSP Agent implementation in PyTorch.
-
-  See open_spiel/python/examples/kuhn_nfsp.py for a usage example.
-  """
-
-  def __init__(self,
-               player_id,
-               state_representation_size,
-               num_actions,
-               hidden_layers_sizes,
-               reservoir_buffer_capacity,
-               anticipatory_param,
-               batch_size=128,
-               rl_learning_rate=0.01,
-               sl_learning_rate=0.01,
-               min_buffer_size_to_learn=1000,
-               learn_every=64,
-               optimizer_str="sgd",
-               **kwargs):
-    """Initialize the `NFSP` agent."""
-    self.player_id = player_id
-    self._num_actions = num_actions
-    self._layer_sizes = hidden_layers_sizes
-    self._batch_size = batch_size
-    self._learn_every = learn_every
-    self._anticipatory_param = anticipatory_param
-    self._min_buffer_size_to_learn = min_buffer_size_to_learn
-
-    self._reservoir_buffer = ReservoirBuffer(reservoir_buffer_capacity)
-    self._prev_timestep = None
-    self._prev_action = None
-
-    # Step counter to keep track of learning.
-    self._step_counter = 0
-
-    # Inner RL agent
-    kwargs.update({
-        "batch_size": batch_size,
-        "learning_rate": rl_learning_rate,
-        "learn_every": learn_every,
-        "min_buffer_size_to_learn": min_buffer_size_to_learn,
-        "optimizer_str": optimizer_str,
-    })
-    self._rl_agent = dqn.DQN(player_id, state_representation_size,
-                             num_actions, hidden_layers_sizes, **kwargs)
-
-    # Keep track of the last training loss achieved in an update step.
-    self._last_rl_loss_value = lambda: self._rl_agent.loss
-    self._last_sl_loss_value = None
-
-    # Average policy network.
-    self._avg_network = dqn.MLP(state_representation_size,
-                                self._layer_sizes, num_actions)
-
-    self._savers = [
-        ("q_network", self._rl_agent._q_network),
-        ("avg_network", self._avg_network)
-    ]
-
-    if optimizer_str == "adam":
-      self.optimizer = torch.optim.Adam(
-          self._avg_network.parameters(), lr=sl_learning_rate)
-    elif optimizer_str == "sgd":
-      self.optimizer = torch.optim.SGD(
-          self._avg_network.parameters(), lr=sl_learning_rate)
-    else:
-      raise ValueError("Not implemented. Choose from ['adam', 'sgd'].")
-
-    self._sample_episode_policy()
-
-  @contextlib.contextmanager
-  def temp_mode_as(self, mode):
-    """Context manager to temporarily overwrite the mode."""
-    previous_mode = self._mode
-    self._mode = mode
-    yield
-    self._mode = previous_mode
-
-  def _sample_episode_policy(self):
-    if np.random.rand() < self._anticipatory_param:
-      self._mode = MODE.best_response
-    else:
-      self._mode = MODE.average_policy
-
-  def _act(self, info_state, legal_actions):
-    info_state = np.reshape(info_state, [1, -1])
-    action_values = self._avg_network(torch.Tensor(info_state))
-    action_probs = F.softmax(action_values, dim=1).detach()
-
-    self._last_action_values = action_values[0]
-    # Remove illegal actions and renormalize the probabilities.
-    probs = np.zeros(self._num_actions)
-    probs[legal_actions] = action_probs[0][legal_actions]
-    probs /= sum(probs)
-    action = np.random.choice(len(probs), p=probs)
-    return action, probs
-
-  @property
-  def mode(self):
-    return self._mode
-
-  @property
-  def loss(self):
-    return (self._last_sl_loss_value, self._last_rl_loss_value().detach())
-
-  def step(self, time_step, is_evaluation=False):
-    """Returns the action to be taken and updates the Q-networks if needed.
-
-    Args:
-      time_step: an instance of rl_environment.TimeStep.
-      is_evaluation: bool, whether this is a training or evaluation call.
-
-    Returns:
-      A `rl_agent.StepOutput` containing the action probs and chosen action.
-    """
-    if self._mode == MODE.best_response:
-      agent_output = self._rl_agent.step(time_step, is_evaluation)
-      if not is_evaluation and not time_step.last():
-        self._add_transition(time_step, agent_output)
-
-    elif self._mode == MODE.average_policy:
-      # Act step: don't act at terminal info states.
-      if not time_step.last():
-        info_state = time_step.observations["info_state"][self.player_id]
-        legal_actions = time_step.observations["legal_actions"][self.player_id]
-        action, probs = self._act(info_state, legal_actions)
-        agent_output = rl_agent.StepOutput(action=action, probs=probs)
-
-      if self._prev_timestep and not is_evaluation:
-        self._rl_agent.add_transition(self._prev_timestep, self._prev_action,
-                                      time_step)
-    else:
-      raise ValueError("Invalid mode ({})".format(self._mode))
-
-    if not is_evaluation:
-      self._step_counter += 1
-
-      if self._step_counter % self._learn_every == 0:
-        self._last_sl_loss_value = self._learn()
-        # If the RL agent's step() was not called this turn, learn here.
-        if self._mode == MODE.average_policy:
-          self._rl_agent.learn()
-
-    # Prepare for the next episode.
-    if time_step.last():
-      self._sample_episode_policy()
-      self._prev_timestep = None
-      self._prev_action = None
-      return
-    else:
-      self._prev_timestep = time_step
-      self._prev_action = agent_output.action
-
-    return agent_output
-
-  def _add_transition(self, time_step, agent_output):
-    """Adds the new transition using `time_step` to the reservoir buffer.
-
-    Transitions are in the form (info_state, agent_output.probs, legal_mask).
-
-    Args:
-      time_step: an instance of rl_environment.TimeStep.
-      agent_output: an instance of rl_agent.StepOutput.
-    """
-    legal_actions = time_step.observations["legal_actions"][self.player_id]
-    legal_actions_mask = np.zeros(self._num_actions)
-    legal_actions_mask[legal_actions] = 1.0
-    transition = Transition(
-        info_state=(time_step.observations["info_state"][self.player_id][:]),
-        action_probs=agent_output.probs,
-        legal_actions_mask=legal_actions_mask)
-    self._reservoir_buffer.add(transition)
-
-  def _learn(self):
-    """Compute the loss on sampled transitions and perform an avg-network update.
-
-    If there are not enough elements in the buffer, no loss is computed and
-    `None` is returned instead.
-
-    Returns:
-      The average loss obtained on this batch of transitions or `None`.
-    """
-    if (len(self._reservoir_buffer) < self._batch_size or
-        len(self._reservoir_buffer) < self._min_buffer_size_to_learn):
-      return None
-
-    transitions = self._reservoir_buffer.sample(self._batch_size)
-    info_states = torch.Tensor([t.info_state for t in transitions])
-    action_probs = torch.Tensor([t.action_probs for t in transitions])
-
-    self.optimizer.zero_grad()
-    loss = F.cross_entropy(self._avg_network(info_states),
-                           torch.max(action_probs, dim=1)[1])
-    loss.backward()
-    self.optimizer.step()
-    return loss.detach()
-
-  def _full_checkpoint_name(self, checkpoint_dir, name):
-    checkpoint_filename = "_".join([name, "pid" + str(self.player_id)])
-    return os.path.join(checkpoint_dir, checkpoint_filename)
-
-  def _latest_checkpoint_filename(self, name):
-    checkpoint_filename = "_".join([name, "pid" + str(self.player_id)])
-    return checkpoint_filename + "_latest"
-
-  def save(self, checkpoint_dir):
-    """Saves the average policy network and the inner RL agent's q-network.
-
-    Note that this does not save the experience replay buffers and should
-    only be used to restore the agent's policy, not to resume training.
-
-    Args:
-      checkpoint_dir: directory where checkpoints will be saved.
-    """
-    for name, model in self._savers:
-      path = self._full_checkpoint_name(checkpoint_dir, name)
-      torch.save(model.state_dict(), path)
-      logging.info("Saved to path: %s", path)
-
-  def has_checkpoint(self, checkpoint_dir):
-    for name, _ in self._savers:
-      path = self._full_checkpoint_name(checkpoint_dir, name)
-      if os.path.exists(path):
-        return True
-    return False
-
-  def restore(self, checkpoint_dir):
-    """Restores the average policy network and the inner RL agent's q-network.
-
-    Note that this does not restore the experience replay buffers and should
-    only be used to restore the agent's policy, not to resume training.
-
-    Args:
-      checkpoint_dir: directory from which checkpoints will be restored.
-    """
-    for name, model in self._savers:
-      full_checkpoint_dir = self._full_checkpoint_name(checkpoint_dir, name)
-      logging.info("Restoring checkpoint: %s", full_checkpoint_dir)
-      model.load_state_dict(torch.load(full_checkpoint_dir))
-
-
-class ReservoirBuffer(object):
-  """Allows uniform sampling over a stream of data.
-
-  This class supports the storage of arbitrary elements, such as observation
-  tensors, integer actions, etc.
-
-  See https://en.wikipedia.org/wiki/Reservoir_sampling for more details.
-  """
-
-  def __init__(self, reservoir_buffer_capacity):
-    self._reservoir_buffer_capacity = reservoir_buffer_capacity
-    self._data = []
-    self._add_calls = 0
-
-  def add(self, element):
-    """Potentially adds `element` to the reservoir buffer.
-
-    Args:
-      element: data to be added to the reservoir buffer.
-    """
-    if len(self._data) < self._reservoir_buffer_capacity:
-      self._data.append(element)
-    else:
-      idx = np.random.randint(0, self._add_calls + 1)
-      if idx < self._reservoir_buffer_capacity:
-        self._data[idx] = element
-    self._add_calls += 1
-
-  def sample(self, num_samples):
-    """Returns `num_samples` uniformly sampled from the buffer.
-
-    Args:
-      num_samples: `int`, number of samples to draw.
-
-    Returns:
-      An iterable over `num_samples` random elements of the buffer.
-
-    Raises:
-      ValueError: If there are fewer than `num_samples` elements in the buffer.
-    """
-    if len(self._data) < num_samples:
-      raise ValueError("{} elements could not be sampled from size {}".format(
-          num_samples, len(self._data)))
-    return random.sample(self._data, num_samples)
-
-  def clear(self):
-    self._data = []
-    self._add_calls = 0
-
-  def __len__(self):
-    return len(self._data)
-
-  def __iter__(self):
-    return iter(self._data)
diff --git a/open_spiel/python/pytorch/nfsp_pytorch_test.py b/open_spiel/python/pytorch/nfsp_pytorch_test.py
deleted file mode 100644
index 3f2f3015f4..0000000000
--- a/open_spiel/python/pytorch/nfsp_pytorch_test.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright 2019 DeepMind Technologies Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tests for open_spiel.python.pytorch.nfsp."""
-
-import random
-from absl.testing import absltest
-import torch
-
-from open_spiel.python import rl_environment
-from open_spiel.python.pytorch import nfsp
-
-
-SEED = 24984617
-
-
-class NFSPTest(absltest.TestCase):
-
-  def test_run_kuhn(self):
-    env = rl_environment.Environment("kuhn_poker")
-    state_size = env.observation_spec()["info_state"][0]
-    num_actions = env.action_spec()["num_actions"]
-
-    agents = [
-        nfsp.NFSP(  # pylint: disable=g-complex-comprehension
-            player_id,
-            state_representation_size=state_size,
-            num_actions=num_actions,
-            hidden_layers_sizes=[16],
-            reservoir_buffer_capacity=10,
-            anticipatory_param=0.1) for player_id in [0, 1]
-    ]
-    for unused_ep in range(10):
-      time_step = env.reset()
-      while not time_step.last():
-        current_player = time_step.observations["current_player"]
-        current_agent = agents[current_player]
-        agent_output = current_agent.step(time_step)
-        time_step = env.step([agent_output.action])
-      for agent in agents:
-        agent.step(time_step)
-
-
-class ReservoirBufferTest(absltest.TestCase):
-
-  def test_reservoir_buffer_add(self):
-    reservoir_buffer = nfsp.ReservoirBuffer(reservoir_buffer_capacity=10)
-    self.assertEmpty(reservoir_buffer)
-    reservoir_buffer.add("entry1")
-    self.assertLen(reservoir_buffer, 1)
-    reservoir_buffer.add("entry2")
-    self.assertLen(reservoir_buffer, 2)
-
-    self.assertIn("entry1", reservoir_buffer)
-    self.assertIn("entry2", reservoir_buffer)
-
-  def test_reservoir_buffer_max_capacity(self):
-    reservoir_buffer = nfsp.ReservoirBuffer(reservoir_buffer_capacity=2)
-    reservoir_buffer.add("entry1")
-    reservoir_buffer.add("entry2")
-    reservoir_buffer.add("entry3")
-
-    self.assertLen(reservoir_buffer, 2)
-
-  def test_reservoir_buffer_sample(self):
-    replay_buffer = nfsp.ReservoirBuffer(reservoir_buffer_capacity=3)
-    replay_buffer.add("entry1")
-    replay_buffer.add("entry2")
-    replay_buffer.add("entry3")
-
-    samples = replay_buffer.sample(3)
-
-    self.assertIn("entry1", samples)
-    self.assertIn("entry2", samples)
-    self.assertIn("entry3", samples)
-
-
-if __name__ == "__main__":
-  random.seed(SEED)
-  torch.manual_seed(SEED)
-  absltest.main()
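Notes for users of the removed module. The ReservoirBuffer deleted above is plain
reservoir sampling (Algorithm R) and is easy to keep as a standalone utility. The
sketch below is a minimal, hedged reconstruction of the removed add/sample logic;
the class name `UniformReservoir` is illustrative, not part of any OpenSpiel API.

import random

class UniformReservoir(object):
  """Keeps a uniform random sample of at most `capacity` stream elements."""

  def __init__(self, capacity):
    self._capacity = capacity
    self._data = []
    self._add_calls = 0  # Total number of elements ever offered.

  def add(self, element):
    # Below capacity, always store. Afterwards, keep the new element with
    # probability capacity / (elements seen so far), replacing a uniformly
    # chosen slot; this preserves uniformity over the whole stream.
    if len(self._data) < self._capacity:
      self._data.append(element)
    else:
      idx = random.randint(0, self._add_calls)  # Uniform over 0.._add_calls.
      if idx < self._capacity:
        self._data[idx] = element
    self._add_calls += 1

  def sample(self, num_samples):
    if len(self._data) < num_samples:
      raise ValueError("not enough elements to sample")
    return random.sample(self._data, num_samples)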
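The removed agent's average-policy action selection in _act() reduces to: softmax
the average network's outputs, zero out illegal actions, renormalize, and sample.
A self-contained NumPy sketch of that step; the logits and legal-action set here
are made-up inputs for illustration.

import numpy as np

def act_from_logits(logits, legal_actions, num_actions):
  # Softmax over all actions (shifted by the max for numerical stability).
  exp = np.exp(logits - np.max(logits))
  action_probs = exp / exp.sum()
  # Zero out illegal actions and renormalize over the legal ones.
  probs = np.zeros(num_actions)
  probs[legal_actions] = action_probs[legal_actions]
  probs /= probs.sum()
  return np.random.choice(num_actions, p=probs), probs

action, probs = act_from_logits(
    np.array([0.5, 2.0, -1.0]), legal_actions=[0, 2], num_actions=3)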
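Finally, the supervised (average-policy) update in the removed _learn() is a
cross-entropy fit of the average network's logits to the argmax of the stored
action probabilities. A minimal PyTorch sketch of one such update step, with a
stand-in linear network and a random batch in place of reservoir samples.

import torch
import torch.nn.functional as F

avg_network = torch.nn.Linear(4, 3)  # Stand-in for the dqn.MLP average network.
optimizer = torch.optim.SGD(avg_network.parameters(), lr=0.01)

info_states = torch.rand(8, 4)   # Batch of sampled info states (stand-in data).
action_probs = torch.rand(8, 3)  # Stored behaviour probabilities per transition.

optimizer.zero_grad()
targets = torch.max(action_probs, dim=1)[1]  # Most likely action per transition.
loss = F.cross_entropy(avg_network(info_states), targets)
loss.backward()
optimizer.step()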