Evaluating performance of contextual bandit agents in examples #314

Open · TMorville opened this issue Sep 2, 2020 · 2 comments

@TMorville (Contributor)

I have been playing around with the DCBTrainer and found some potential inconsistencies.

  1. StatlogDataBandit example found here
from genrl.utils import StatlogDataBandit
from genrl.agents import NeuralLinearPosteriorAgent
from genrl.trainers import DCBTrainer

bandit = StatlogDataBandit(download=True)
agent = NeuralLinearPosteriorAgent(bandit)
context = bandit.reset()

# single manual interaction to sanity-check the loop
action = agent.select_action(context)
new_context, reward = bandit.step(action)

trainer = DCBTrainer(agent, bandit)
trainer.train(timesteps=1000, batch_size=32)

and code to evaluate

import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score

def _evaluate(trainer, bandit):
    # labels are in the last column for the Statlog data
    y_true = bandit.df.iloc[:, -1].to_numpy()

    # baseline: always predict the most frequent class
    class_distribution = bandit.df.iloc[:, -1].value_counts()
    most_freq_class = class_distribution.idxmax()
    baseline_accuracy = accuracy_score(
        y_true, np.resize(most_freq_class, len(bandit.df))
    ).round(2)

    # contexts as float tensors, one row per example
    tensor_matrix = torch.tensor(
        bandit.df.iloc[:, :-1].to_numpy(), dtype=torch.float
    )

    # query the trained agent on every row of the dataset
    y_pred = []
    for context in tensor_matrix:
        y_pred.append(trainer.agent.select_action(context).item())

    print("Baseline accuracy score: {}".format(baseline_accuracy))
    print("After {} steps accuracy is {}".format(
        trainer.agent.t, accuracy_score(y_true, y_pred).round(2)
    ))
    print("Classification report")
    print(classification_report(y_true, y_pred))

    fig = plt.figure(figsize=(10, 4))
    plt.subplot(121)
    plt.plot(bandit.cum_reward_hist)
    plt.title("Cumulative reward")
    plt.subplot(122)
    plt.plot(bandit.cum_regret_hist)
    plt.title("Cumulative regret")
    plt.tight_layout()

    return y_true, y_pred

yt, yp = _evaluate(trainer, bandit)

Baseline accuracy score: 0.78
After 44501 steps accuracy is 0.78
Classification report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.78      0.99      0.88     34108
           2       0.00      0.00      0.00        37
           3       0.00      0.00      0.00       132
           4       0.14      0.00      0.00      6748
           5       0.17      0.00      0.00      2458
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00        11

    accuracy                           0.78     43500
   macro avg       0.14      0.12      0.11     43500
weighted avg       0.65      0.78      0.69     43500
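
Note that 34108 / 43500 ≈ 0.78, so the agent exactly matches the constant majority-class predictor, and the 0.99 recall on class 1 suggests the predictions collapse onto that single class rather than using the context.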
  2. WineDataBandit example found here

Define the bandit

from typing import Tuple

import pandas as pd
import torch

from genrl.utils.data_bandits.base import DataBasedBandit
from genrl.utils.data_bandits.utils import download_data


URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"

class WineDataBandit(DataBasedBandit):
    def __init__(self, **kwargs):
        super(WineDataBandit, self).__init__(**kwargs)

        path = kwargs.get("path", "./data/Wine/")
        download = kwargs.get("download", None)
        force_download = kwargs.get("force_download", None)
        url = kwargs.get("url", URL)

        if download:
            path = download_data(path, url, force_download)

        self._df = pd.read_csv(path, header=None)
        # the label (wine cultivar, 1..3) sits in column 0
        self.n_actions = len(self._df[0].unique())
        self.context_dim = self._df.shape[1] - 1
        self.len = len(self._df)

    def reset(self) -> torch.Tensor:
        self._reset()
        # work on a shuffled copy; subsequent reads must use self.df
        self.df = self._df.sample(frac=1).reset_index(drop=True)
        return self._get_context()

    def _compute_reward(self, action: int) -> Tuple[int, int]:
        # actions are 0-indexed while labels are 1-indexed, hence action + 1
        label = self.df.iloc[self.idx, 0]
        r = int(label == (action + 1))
        return r, 1

    def _get_context(self) -> torch.Tensor:
        return torch.tensor(
            self.df.iloc[self.idx, 1:].values,
            device=self.device,
            dtype=torch.float,
        )

training

from genrl.agents import NeuralLinearPosteriorAgent
from genrl.trainers import DCBTrainer

bandit = WineDataBandit(path="/path/to/data")
agent = NeuralLinearPosteriorAgent(bandit)
context = bandit.reset()

# single manual interaction to sanity-check the loop
action = agent.select_action(context)
new_context, reward = bandit.step(action)

trainer = DCBTrainer(agent, bandit)
trainer.train(timesteps=5000, batch_size=32)

and evaluation

import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score

def _evaluate(trainer, bandit):
    # labels are in the first column for the wine data
    y_true = bandit.df.iloc[:, 0].to_numpy()

    # baseline: always predict the most frequent class
    class_distribution = bandit.df.iloc[:, 0].value_counts()
    most_freq_class = class_distribution.idxmax()
    baseline_accuracy = accuracy_score(
        y_true, np.resize(most_freq_class, len(bandit.df))
    ).round(2)

    # contexts as float tensors, one row per example
    tensor_matrix = torch.tensor(
        bandit.df.iloc[:, 1:].to_numpy(), dtype=torch.float
    )

    # NOTE: select_action returns arms 0..2 while the wine labels run
    # 1..3 (see the action + 1 in _compute_reward), so y_pred and
    # y_true are offset by one here
    y_pred = []
    for context in tensor_matrix:
        y_pred.append(trainer.agent.select_action(context).item())

    print("Baseline accuracy score: {}".format(baseline_accuracy))
    print("After {} steps accuracy is {}".format(
        trainer.agent.t, accuracy_score(y_true, y_pred).round(2)
    ))
    print("Classification report")
    print(classification_report(y_true, y_pred))

    fig = plt.figure(figsize=(10, 4))
    plt.subplot(121)
    plt.plot(bandit.cum_reward_hist)
    plt.title("Cumulative reward")
    plt.subplot(122)
    plt.plot(bandit.cum_regret_hist)
    plt.title("Cumulative regret")
    plt.tight_layout()

    return y_true, y_pred

yt, yp = _evaluate(trainer, bandit)

Baseline accuracy score: 0.4
After 5357 steps accuracy is 0.0
Classification report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00      59.0
           2       0.00      0.00      0.00      71.0
           3       0.00      0.00      0.00      48.0

    accuracy                           0.00     178.0
   macro avg       0.00      0.00      0.00     178.0
weighted avg       0.00      0.00      0.00     178.0

For both cases (and the third Titanic case referenced in #301), both reward and regret increase during training, which could mean that no actual learning is happening and that the growth in reward comes purely from random guessing rather than learning.
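
To make the random-guessing reading concrete, here is a back-of-the-envelope check (the numbers are mine, under the assumption that exactly one arm is rewarded per context, matching the label == action + 1 reward above):

k = 7                                      # Statlog has 7 classes
steps = 1000                               # training length used above
expected_cum_reward = steps / k            # uniform play earns 1/k per step, ~143
expected_cum_regret = steps * (k - 1) / k  # optimal earns 1 per step, so regret ~857
print(expected_cum_reward, expected_cum_regret)
# both cumulative curves grow roughly linearly even with zero learning,
# so an increasing reward curve alone does not demonstrate learning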

Notice that for the Statlog data the label (the column being predicted) is the last column, while in the wine data it is the first column.
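
A small helper would make that positional difference explicit, so one evaluation function can serve both bandits (the helper and its dataset-to-column mapping below are mine, not part of genrl):

# hypothetical mapping from bandit class name to label-column position
LABEL_COLUMN = {"StatlogDataBandit": -1, "WineDataBandit": 0}

def split_features_labels(bandit):
    # look up where the label column sits for this bandit's dataset
    col = LABEL_COLUMN[type(bandit).__name__]
    y = bandit.df.iloc[:, col].to_numpy()
    X = bandit.df.drop(columns=bandit.df.columns[col]).to_numpy()
    return X, y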

@TMorville changed the title from "Any real learning happening for DCBTrainer?" to "Evaluating performance of agents in examples" on Sep 2, 2020
@threewisemonkeys-as self-assigned this on Sep 2, 2020
@TMorville changed the title from "Evaluating performance of agents in examples" to "Evaluating performance of contextual bandit agents in examples" on Sep 3, 2020
@sampreet-arthi (Member)

Is this still an issue?

@TMorville (Contributor, Author)

It is. However, I think my baseline might be wrong.

I think the relevant baseline to compare performance against should be a Bayesian regression trained directly on the data, rather than the output of the neural network. Do you agree?
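
Something like this minimal sketch is what I have in mind (the one-vs-rest reduction with sklearn's BayesianRidge and the 70/30 split are my assumptions; any supervised probabilistic model would serve):

import numpy as np
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# wine layout: label in column 0, features in the rest
X = bandit.df.iloc[:, 1:].to_numpy()
y = bandit.df.iloc[:, 0].to_numpy()
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

# one Bayesian linear regression per class on 0/1 targets, predict the argmax
classes = np.unique(y_tr)
models = {c: BayesianRidge().fit(X_tr, (y_tr == c).astype(float)) for c in classes}
scores = np.column_stack([models[c].predict(X_te) for c in classes])
y_hat = classes[scores.argmax(axis=1)]

print("Supervised baseline accuracy:", accuracy_score(y_te, y_hat).round(2))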
