Skip to content

Commit

Permalink
update python scripts adapt to new libgosdt python library
Browse files Browse the repository at this point in the history
Signed-off-by: Reto Achermann <[email protected]>
  • Loading branch information
achreto committed Feb 25, 2022
1 parent d70a4cb commit 91d6634
Show file tree
Hide file tree
Showing 18 changed files with 1,415 additions and 208 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ gosdt_test
.idea*
.vscode*
autom4te.cache*
gosdt.dSYM*
gosdt.dSYM*
results
168 changes: 92 additions & 76 deletions python/collectors/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import time
import random
import sys
import os
import os

from math import ceil
from math import ceil, log
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from time import sleep
Expand All @@ -16,6 +16,7 @@
from model.osdt import OSDT
from model.encoder import Encoder
from model.corels import CORELS
from dl85 import DL85Classifier

def file_exists(path):
exists = False
Expand Down Expand Up @@ -45,77 +46,92 @@ def benchmark(datasets, algorithms):
if not file_exists(dataset_file_path):
exit(1)
for algorithm in algorithms:
for regularization in configuration["regularization"]:
trial_path = "results/benchmark/trials_{}_{}_{}".format(dataset,algorithm,regularization)
result_file_path = "{}.csv".format(trial_path)
temp_file_path = "{}.tmp".format(trial_path)

if file_exists(result_file_path):
continue # This set of trials is already complete

temp_file = open(temp_file_path, "w")
temp_file.write("algorithm,regularization,fold,train,test,depth,leaves,nodes,time\n")

dataframe = pd.DataFrame(pd.read_csv(dataset_file_path)).dropna()
X = pd.DataFrame(dataframe.iloc[:,:-1], columns=dataframe.columns[:-1])
y = pd.DataFrame(dataframe.iloc[:,-1], columns=dataframe.columns[-1:])
(n, m) = X.shape

def trial(generator):
kfolds = KFold(n_splits=configuration["trials"], random_state=0)
fold = 0
for train_index, test_index in kfolds.split(X):
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]
try:
model = generator()
model.fit(X_train, y_train)
except Exception as e:
print(str(e))
default_row = [algorithm,regularization,fold,0,0,0,0,0,0]
temp_file.write(",".join(str(e) for e in default_row) + "\n")
print(default_row)
print("The following data subset caused an error: error.csv")
print(X_train, y_train)
X_train.insert(m, "class", y_train) # It is expected that the last column is the label column
output = pd.DataFrame(X_train)
output.to_csv("error.csv", index=False)
exit(1)
else:
train = model.error(X_train, y_train)
test = model.error(X_test, y_test)
row = [
algorithm,regularization,fold,
train,test,
model.max_depth(), model.leaves(), model.nodes(),
model.time
]
print(row)
temp_file.write(",".join(str(e) for e in row) + "\n")
fold += 1

if algorithm == "dl85":
trial(lambda : DL85(regularization=regularization, time_limit=configuration["time_limit"], preprocessor="complete"))

elif algorithm == "osdt":
config = {
"regularization": regularization,
"time_limit": configuration["time_limit"]
}
trial(lambda : OSDT(config, preprocessor="complete"))

elif algorithm == "gosdt":
config = {
"regularization": regularization,
"similar_support": False,
"strong_indifference": False,
"time_limit": configuration["time_limit"]
}
trial(lambda : GOSDT(config))

elif algorithm == "corels":
trial(lambda : CORELS(regularization=regularization))

temp_file.close() # temp file is complete
os.rename(temp_file_path, result_file_path) # commit this file
print("Trials Completed:", result_file_path)
for depth_budget in configuration["depth_budget"]:
for regularization in configuration["regularization"]:
trial_path = "results/benchmark/trials_{}_{}_{}_{}".format(dataset,algorithm,regularization, depth_budget)
result_file_path = "{}.csv".format(trial_path)
temp_file_path = "{}.tmp".format(trial_path)

if file_exists(result_file_path):
continue # This set of trials is already complete

temp_file = open(temp_file_path, "w")
temp_file.write("algorithm,regularization,fold,train,test,depth,leaves,nodes,time\n")

dataframe = pd.DataFrame(pd.read_csv(dataset_file_path)).dropna()
X = pd.DataFrame(dataframe.iloc[:,:-1], columns=dataframe.columns[:-1])
y = pd.DataFrame(dataframe.iloc[:,-1], columns=dataframe.columns[-1:])
(n, m) = X.shape

def trial(generator):
kfolds = KFold(n_splits=configuration["trials"], shuffle=True, random_state=0)
fold = 0
for train_index, test_index in kfolds.split(X):
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]
try:
model = generator()
model.fit(X_train, y_train)
except Exception as e:
print(str(e))
default_row = [algorithm,regularization,fold,0,0,0,0,0,0]
temp_file.write(",".join(str(e) for e in default_row) + "\n")
print(default_row)
print("The following data subset caused an error: error.csv")
print(X_train, y_train)
X_train.insert(m, "class", y_train) # It is expected that the last column is the label column
output = pd.DataFrame(X_train)
output.to_csv("error.csv", index=False)
exit(1)
else:
train = model.error(X_train, y_train)
test = model.error(X_test, y_test)
row = [
algorithm,regularization,fold,
train,test,
model.max_depth(), model.leaves(), model.nodes(),
model.utime
]
print(row)
temp_file.write(",".join(str(e) for e in row) + "\n")
fold += 1

if algorithm == "dl85":
trial(lambda : DL85(depth=depth_budget, regularization = regularization, time_limit=configuration["time_limit"], preprocessor="complete"))

elif algorithm == "dl85_no_min_sup":
trial(lambda : DL85(depth=depth_budget, time_limit=configuration["time_limit"], preprocessor="complete"))

elif algorithm == "osdt":
config = {
"regularization": regularization,
"time_limit": configuration["time_limit"]
}
trial(lambda : OSDT(config, preprocessor="complete"))

elif algorithm == "gosdt":
config = {
"regularization": regularization,
"similar_support": False,
"strong_indifference": False,
"time_limit": configuration["time_limit"],
"depth_budget": depth_budget
}
trial(lambda : GOSDT(config))

elif algorithm == "gosdt_count_subproblems":
config = {
"regularization": regularization,
"similar_support": False,
"strong_indifference": False,
"time_limit": configuration["time_limit"],
"depth_budget": depth_budget
}
trial(lambda : GOSDT(config))

elif algorithm == "corels":
trial(lambda : CORELS(regularization=regularization))

temp_file.close() # temp file is complete
os.rename(temp_file_path, result_file_path) # commit this file
print("Trials Completed:", result_file_path)
Loading

0 comments on commit 91d6634

Please sign in to comment.