From 91d6634652caacf0a51e4beeae903d2040def610 Mon Sep 17 00:00:00 2001 From: Reto Achermann Date: Fri, 25 Feb 2022 10:43:31 -0800 Subject: [PATCH] update python scripts adapt to new libgosdt python library Signed-off-by: Reto Achermann --- .gitignore | 3 +- python/collectors/benchmark.py | 168 ++-- python/collectors/run_single_configuration.py | 718 ++++++++++++++++++ python/example.py | 2 +- python/model/binoct.py | 10 +- python/model/cart.py | 32 +- python/model/corels.py | 6 + python/model/dl85.py | 49 +- python/model/ghoul.py | 364 +++++++++ python/model/gosdt.py | 93 ++- python/model/osdt.py | 16 +- python/model/pygosdt_lib/models/objective.py | 2 +- .../pygosdt_lib/models/osdt_classifier.py | 4 +- .../models/parallel_osdt_classifier.py | 4 +- python/model/tree_classifier.py | 10 +- python/run.py | 96 ++- python/visualizations/trace.py | 21 +- python/visualizations/tree.py | 25 +- 18 files changed, 1415 insertions(+), 208 deletions(-) create mode 100644 python/collectors/run_single_configuration.py create mode 100644 python/model/ghoul.py diff --git a/.gitignore b/.gitignore index 7ed0973..fa2e5a4 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,5 @@ gosdt_test .idea* .vscode* autom4te.cache* -gosdt.dSYM* \ No newline at end of file +gosdt.dSYM* +results \ No newline at end of file diff --git a/python/collectors/benchmark.py b/python/collectors/benchmark.py index 69dd676..19fde1f 100644 --- a/python/collectors/benchmark.py +++ b/python/collectors/benchmark.py @@ -4,9 +4,9 @@ import time import random import sys -import os +import os -from math import ceil +from math import ceil, log from sklearn.model_selection import KFold from sklearn.utils import shuffle from time import sleep @@ -16,6 +16,7 @@ from model.osdt import OSDT from model.encoder import Encoder from model.corels import CORELS +from dl85 import DL85Classifier def file_exists(path): exists = False @@ -45,77 +46,92 @@ def benchmark(datasets, algorithms): if not file_exists(dataset_file_path): exit(1) for algorithm in algorithms: - for regularization in configuration["regularization"]: - trial_path = "results/benchmark/trials_{}_{}_{}".format(dataset,algorithm,regularization) - result_file_path = "{}.csv".format(trial_path) - temp_file_path = "{}.tmp".format(trial_path) - - if file_exists(result_file_path): - continue # This set of trials is already complete - - temp_file = open(temp_file_path, "w") - temp_file.write("algorithm,regularization,fold,train,test,depth,leaves,nodes,time\n") - - dataframe = pd.DataFrame(pd.read_csv(dataset_file_path)).dropna() - X = pd.DataFrame(dataframe.iloc[:,:-1], columns=dataframe.columns[:-1]) - y = pd.DataFrame(dataframe.iloc[:,-1], columns=dataframe.columns[-1:]) - (n, m) = X.shape - - def trial(generator): - kfolds = KFold(n_splits=configuration["trials"], random_state=0) - fold = 0 - for train_index, test_index in kfolds.split(X): - X_train, y_train = X.iloc[train_index], y.iloc[train_index] - X_test, y_test = X.iloc[test_index], y.iloc[test_index] - try: - model = generator() - model.fit(X_train, y_train) - except Exception as e: - print(str(e)) - default_row = [algorithm,regularization,fold,0,0,0,0,0,0] - temp_file.write(",".join(str(e) for e in default_row) + "\n") - print(default_row) - print("The following data subset caused an error: error.csv") - print(X_train, y_train) - X_train.insert(m, "class", y_train) # It is expected that the last column is the label column - output = pd.DataFrame(X_train) - output.to_csv("error.csv", index=False) - exit(1) - else: - train = 
model.error(X_train, y_train) - test = model.error(X_test, y_test) - row = [ - algorithm,regularization,fold, - train,test, - model.max_depth(), model.leaves(), model.nodes(), - model.time - ] - print(row) - temp_file.write(",".join(str(e) for e in row) + "\n") - fold += 1 - - if algorithm == "dl85": - trial(lambda : DL85(regularization=regularization, time_limit=configuration["time_limit"], preprocessor="complete")) - - elif algorithm == "osdt": - config = { - "regularization": regularization, - "time_limit": configuration["time_limit"] - } - trial(lambda : OSDT(config, preprocessor="complete")) - - elif algorithm == "gosdt": - config = { - "regularization": regularization, - "similar_support": False, - "strong_indifference": False, - "time_limit": configuration["time_limit"] - } - trial(lambda : GOSDT(config)) - - elif algorithm == "corels": - trial(lambda : CORELS(regularization=regularization)) - - temp_file.close() # temp file is complete - os.rename(temp_file_path, result_file_path) # commit this file - print("Trials Completed:", result_file_path) + for depth_budget in configuration["depth_budget"]: + for regularization in configuration["regularization"]: + trial_path = "results/benchmark/trials_{}_{}_{}_{}".format(dataset,algorithm,regularization, depth_budget) + result_file_path = "{}.csv".format(trial_path) + temp_file_path = "{}.tmp".format(trial_path) + + if file_exists(result_file_path): + continue # This set of trials is already complete + + temp_file = open(temp_file_path, "w") + temp_file.write("algorithm,regularization,fold,train,test,depth,leaves,nodes,time\n") + + dataframe = pd.DataFrame(pd.read_csv(dataset_file_path)).dropna() + X = pd.DataFrame(dataframe.iloc[:,:-1], columns=dataframe.columns[:-1]) + y = pd.DataFrame(dataframe.iloc[:,-1], columns=dataframe.columns[-1:]) + (n, m) = X.shape + + def trial(generator): + kfolds = KFold(n_splits=configuration["trials"], shuffle=True, random_state=0) + fold = 0 + for train_index, test_index in kfolds.split(X): + X_train, y_train = X.iloc[train_index], y.iloc[train_index] + X_test, y_test = X.iloc[test_index], y.iloc[test_index] + try: + model = generator() + model.fit(X_train, y_train) + except Exception as e: + print(str(e)) + default_row = [algorithm,regularization,fold,0,0,0,0,0,0] + temp_file.write(",".join(str(e) for e in default_row) + "\n") + print(default_row) + print("The following data subset caused an error: error.csv") + print(X_train, y_train) + X_train.insert(m, "class", y_train) # It is expected that the last column is the label column + output = pd.DataFrame(X_train) + output.to_csv("error.csv", index=False) + exit(1) + else: + train = model.error(X_train, y_train) + test = model.error(X_test, y_test) + row = [ + algorithm,regularization,fold, + train,test, + model.max_depth(), model.leaves(), model.nodes(), + model.utime + ] + print(row) + temp_file.write(",".join(str(e) for e in row) + "\n") + fold += 1 + + if algorithm == "dl85": + trial(lambda : DL85(depth=depth_budget, regularization = regularization, time_limit=configuration["time_limit"], preprocessor="complete")) + + elif algorithm == "dl85_no_min_sup": + trial(lambda : DL85(depth=depth_budget, time_limit=configuration["time_limit"], preprocessor="complete")) + + elif algorithm == "osdt": + config = { + "regularization": regularization, + "time_limit": configuration["time_limit"] + } + trial(lambda : OSDT(config, preprocessor="complete")) + + elif algorithm == "gosdt": + config = { + "regularization": regularization, + "similar_support": False, + 
"strong_indifference": False, + "time_limit": configuration["time_limit"], + "depth_budget": depth_budget + } + trial(lambda : GOSDT(config)) + + elif algorithm == "gosdt_count_subproblems": + config = { + "regularization": regularization, + "similar_support": False, + "strong_indifference": False, + "time_limit": configuration["time_limit"], + "depth_budget": depth_budget + } + trial(lambda : GOSDT(config)) + + elif algorithm == "corels": + trial(lambda : CORELS(regularization=regularization)) + + temp_file.close() # temp file is complete + os.rename(temp_file_path, result_file_path) # commit this file + print("Trials Completed:", result_file_path) diff --git a/python/collectors/run_single_configuration.py b/python/collectors/run_single_configuration.py new file mode 100644 index 0000000..c9b0595 --- /dev/null +++ b/python/collectors/run_single_configuration.py @@ -0,0 +1,718 @@ +import numpy as np +import csv +import os +import pathlib +import platform +from datetime import datetime +import hashlib +import json +from resource import getrusage, RUSAGE_SELF +import time + +import numpy as np +import pandas as pd +import time +from model.cart import CART +from model.corels import CORELS +from model.dl85 import DL85 +from model.gosdt import GOSDT +from model.osdt import OSDT +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.model_selection import KFold, cross_val_score + + +# runs a single configuration on the loaded dataset, with the supplied configuration +def do_run_config(X_train, y_train, X_test, y_test, path_to_labels, cfg) : + print("preparing algorithm.", flush=True) + # now create the models + if cfg['algorithm'] == "dl85" or cfg['algorithm'] == "dl8.5" : + model = DL85(depth=cfg['depth_budget'], # subtract one to limit the depth + regularization = cfg['regularization'], + time_limit=cfg['time_limit'], + preprocessor="none", + warm = None if path_to_labels is None else np.genfromtxt(path_to_labels, skip_header=1)) #the warm labels for dl8.5 require an array, not a file pointer + + elif cfg['algorithm'] == "dl85_no_min_sup" or cfg['algorithm'] == "dl8.5_no_min_sup": + model = DL85(depth=cfg['depth_budget'], + time_limit=cfg['time_limit'], + preprocessor="none", + warm = None if path_to_labels is None else np.genfromtxt(path_to_labels, skip_header=1)) #the warm labels for dl8.5 require an array, not a file pointer + + elif cfg['algorithm'] == "osdt": + config = { + "regularization": cfg['regularization'], + "time_limit": cfg['time_limit'] + } + model = OSDT(config, preprocessor="complete") + + elif cfg['algorithm'] == "gosdt": + config = { + "regularization": cfg['regularization'], + "similar_support": False, + "strong_indifference": False, + "time_limit": cfg['time_limit'], + "depth_budget": cfg['depth_budget'] + } + if cfg['lb_guess']: + config['warm_LB'] = True + config['path_to_labels'] = path_to_labels + model = GOSDT(config) + + elif cfg['algorithm'] == "gosdt_count_subproblems": + config = { + "regularization": cfg['regularization'], + "similar_support": False, + "strong_indifference": False, + "time_limit": cfg['time_limit'], + "depth_budget": cfg['depth_budget'] + } + model = GOSDT(config) + + elif cfg['algorithm'] == "corels": + model = CORELS(regularization=cfg['regularization']) + elif cfg['algorithm'] == "cart": + model = CART(depth=cfg['depth_budget'], regularization=cfg['regularization']) + else : + print("unrecognized algorithm: {}".format(cfg['algorithm'])) + raise Exception + + + # ok now train the model, may throw an exception + print("training 
model", flush=True) + model.fit(X_train, y_train) + + print("evaluate the model, extracting tree and scores", flush=True) + + # store the results + cfg["train_acc"] = model.score(X_train, y_train) + cfg["train_err"] = model.error(X_train, y_train) + cfg["test_acc"] = model.score(X_test, y_test) + cfg["test_err"] = model.error(X_test, y_test) + cfg["depth"] = model.max_depth() + cfg["leaves"] = model.leaves() + cfg["nodes"] = model.nodes() + cfg["time"] = model.utime + cfg['systime'] = model.stime + cfg["lb"] = model.lb + cfg["ub"] = model.ub + cfg["loss"] = model.reported_loss + + return cfg, model.json() + + +# returns the basename to store the results, we just return a hash here +def confighash(cfg) : + # don't take the idx and optimal depth as part of the hash + idx = cfg['idx'] + optdepth = cfg['optimaldepth'] + cfg['idx'] = 0 + cfg['optimaldepth'] = 0 + hash = hashlib.sha256(json.dumps(cfg, indent=2, sort_keys=True).encode('utf-8')).hexdigest() + cfg['idx'] = idx + cfg['optimaldepth'] = optdepth + cfg['hash'] = hash + return hash + + # common = "{}_{}_{}_{}_{}".format(cfg['dataset'], cfg['algorithm'], cfg['regularization'], cfg['depth_budget'], cfg['fold']) + # if cfg["thresh"] : + # common = "{}_{}-{}-{}".format(common, "thresh", cfg['n_est'], cfg['max_depth']) + + # if cfg["lb_guess"]: + # common = "{}_{}-{}-{}".format(common, "lbguess", cfg['lb_n_est'], cfg['lb_max_depth']) + + # return common + +# fit the tree using gradient boosted classifier +def fit_boosted_tree(X, y, n_est=10, lr=0.1, d=1): + clf = GradientBoostingClassifier(loss='deviance', learning_rate=lr, n_estimators=n_est, max_depth=d, + random_state=42) + clf.fit(X, y) + out = clf.score(X, y) + scores = cross_val_score(clf, X, y, cv=5) + return clf, out, scores + + +# perform cut on the dataset +def cut(X, ts): + df = X.copy() + colnames = X.columns + for j in range(len(ts)): + for s in range(len(ts[j])): + X[colnames[j]+'<='+str(ts[j][s])] = 1 + k = df[colnames[j]] > ts[j][s] + X.loc[k, colnames[j]+'<='+str(ts[j][s])] = 0 + X = X.drop(colnames[j], axis=1) + return X + + +# compute the thresholds +def get_thresholds(X, y, n_est, lr, d, backselect=True): + # got a complaint here... 
+ y = np.ravel(y) + # X is a dataframe + clf, out, score = fit_boosted_tree(X, y, n_est, lr, d) + #print('acc:', out, 'acc cv:', score.mean()) + thresholds = [] + for j in range(X.shape[1]): + tj = np.array([]) + for i in range(len(clf.estimators_)): + f = clf.estimators_[i,0].tree_.feature + t = clf.estimators_[i,0].tree_.threshold + tj = np.append(tj, t[f==j]) + tj = np.unique(tj) + thresholds.append(tj.tolist()) + + X_new = cut(X, thresholds) + clf1, out1, scorep = fit_boosted_tree(X_new, y, n_est, lr, d) + #print('acc','1:', out1, 'acc1 cv:', scorep.mean()) + + outp = 1 + Xp = X_new.copy() + clfp = clf1 + itr=0 + if backselect: + while outp >= out1 and itr < X_new.shape[1]-1: + vi = clfp.feature_importances_ + if vi.size > 0: + c = Xp.columns + i = np.argmin(vi) + Xp = Xp.drop(c[i], axis=1) + clfp, outp, scorep = fit_boosted_tree(Xp, y, n_est, lr, d) + itr += 1 + else: + break + Xp[c[i]] = X_new[c[i]] + _, _, scorep = fit_boosted_tree(Xp, y, n_est, lr, d) + + h = Xp.columns + #print('features:', h) + return Xp, thresholds, h, scorep + +# compute the thresholds +def compute_thresholds(X, y, cfg) : + # set LR to 0.1 + lr = 0.1 + start = time.perf_counter() + X, thresholds, header, score = get_thresholds(X, y, cfg['n_est'], lr, cfg['max_depth'], backselect=True) + guess_time = time.perf_counter()-start + + # store the guess time + cfg['guesstime'] = guess_time + cfg['guess_acc'] = score.mean() + return X, thresholds, header, cfg + +def load_dataset(file) : + tstart = time.perf_counter() + datasetfile = file.with_suffix(file.suffix + ".feather") + if datasetfile.exists() : + df = pd.read_feather(datasetfile) + else : + datasetfile = file.with_suffix(file.suffix + ".csv.gz") + df = pd.read_csv(datasetfile, sep=";") + tend = time.perf_counter() + print(f"dataset file: {datasetfile}") + print(f"duration loading: {tend - tstart}s", flush=True) + return df + + + +def get_thresh_dataset(cfg, datasetdir, lb_guess = False) : + print("dataset: getting threshold dataset {}".format(cfg['dataset'])) + + df = load_dataset(datasetdir / "original_datasets" / cfg["dataset"] / cfg["dataset"]) + + X, y = df.iloc[:,:-1].values, df.iloc[:,-1].values + h = df.columns[:-1] + + if cfg['fold'] is not None: + # 5-fold cross validation + kf = KFold(n_splits=5, shuffle=True, random_state=2021) + indices = list(kf.split(X)) + (train_index, test_index) = indices[cfg['fold']] + + X_train = pd.DataFrame(X[train_index], columns=h) + y_train = pd.DataFrame(y[train_index]) + X_test = pd.DataFrame(X[test_index], columns=h) + y_test = pd.DataFrame(y[test_index]) + + X_train, thresholds, header, cfg = compute_thresholds(X_train, y_train, cfg) + + # get test dataset + X_test = cut(X_test, thresholds) + X_test = X_test[header] + else : + X = pd.DataFrame(X, columns=h) + X_train, thresholds, header, cfg = compute_thresholds(X, y, cfg) + y_train = pd.DataFrame(y) + X_test = X_train + y_test = y_train + + train_index = np.arange(0, X_train.shape[0]) + test_index = np.arange(0, X_train.shape[0]) + + return X_train, y_train, X_test, y_test, cfg + + + +def get_non_thresh_dataset(cfg, datasetdir) : + print("dataset: getting non threshold dataset {}".format(cfg['dataset'])) + if cfg['fold'] is not None: + testfile = datasetdir / "binarized_datasets" / cfg['dataset'] / '{}_binarized_datasets.test{}'.format(cfg['dataset'], cfg['fold']) + trainfile = datasetdir / "binarized_datasets" / cfg['dataset'] / '{}_binarized_datasets.train{}'.format(cfg['dataset'], cfg['fold']) + print(f"dataset: testfile {testfile}") + print(f"dataset: trainfile 
{trainfile}") + train_csv = load_dataset(trainfile) + test_csv = load_dataset(testfile) + X_train = pd.DataFrame(train_csv.iloc[:,:-1]) + X_test = pd.DataFrame( test_csv.iloc[:,:-1]) + y_train = pd.DataFrame(train_csv.iloc[:,-1]) + y_test = pd.DataFrame(test_csv.iloc[:,-1]) + else : + datasetfile = datasetdir / "binarized_datasets" / cfg['dataset'] / '{}_binarized_datasets'.format(cfg['dataset']) + print(f"dataset: {datasetfile}") + train_csv = load_dataset(datasetfile) + X_train = pd.DataFrame(train_csv.iloc[:,:-1]) + y_train = pd.DataFrame(train_csv.iloc[:,-1]) + X_test = X_train + y_test = y_train + + return X_train, y_train, X_test, y_test + +def get_lb_guess_dataset_with_labels(cfg, datasetdir, correctness = False, thresh = True) : + if thresh: + #in thresh case, we currently train the warm lb GBDT on the thresholded dataset. + X_train, y_train, X_test, y_test, cfg = get_thresh_dataset(cfg, datasetdir) + else: + #in non-thresh case, we train on the original, not the binarized, dataset (faster training time, same space of models considered) + #TODO: decide whether we need to include the time to load the original dataset in our reported time for warm_lb + df = load_dataset(datasetdir / "original_datasets" / cfg["dataset"] / cfg["dataset"]) + + X, y = df.iloc[:,:-1].values, df.iloc[:,-1].values + h = df.columns[:-1] + + if cfg['fold'] is not None: + # 5-fold cross validation + kf = KFold(n_splits=5, shuffle=True, random_state=2021) + indices = list(kf.split(X)) + (train_index, test_index) = indices[cfg['fold']] + + X_train = pd.DataFrame(X[train_index], columns=h) + y_train = pd.DataFrame(y[train_index]) + X_test = pd.DataFrame(X[test_index], columns=h) + y_test = pd.DataFrame(y[test_index]) + + else : + X_train = pd.DataFrame(X, columns=h) + y_train = pd.DataFrame(y) + X_test = X_train + y_test = y_train + + print("dataset: getting warm lower-bound labels for {}".format(cfg['dataset'])) + + start_time = time.perf_counter() + #make warm labels (predictions of GBDT for the training set - TODO: handle multiclass) + clf = GradientBoostingClassifier(n_estimators=cfg['lb_n_est'], max_depth=cfg['lb_max_depth'], random_state=42) + clf.fit(X_train, y_train.values.flatten()) + warm_labels = clf.predict(X_train) + if correctness: + # use alternative representation of warm_labels if required (1 means correct prediction, 0 means incorrect) + correct_preds = np.zeros(len(warm_labels)) + correct_preds[warm_labels == y_train.values.flatten()] = 1 + warm_labels = correct_preds + elapsed_time = time.perf_counter() - start_time + + cfg['lb_time'] = elapsed_time + + print(f"dataset warm lb: {elapsed_time}") + + test_preds = clf.predict(X_test) + cfg['lb_test_acc'] = np.mean(test_preds == y_test.values.flatten()) + cfg['lb_test_err'] = 1 - cfg['lb_test_acc'] + cfg['lb_train_acc'] = np.mean(clf.predict(X_train) == y_train.values.flatten()) + cfg['lb_train_err'] = 1 - cfg['lb_train_acc'] + + + # save the labels as a tmp file and return the path to it. + labelsdir = pathlib.Path('/tmp/warm_lb_labels') + labelsdir.mkdir(exist_ok=True, parents=True) + + cfghash = cfg['hash'] + labelpath = labelsdir / 'treebench-{}.tmp'.format(cfghash) + pd.DataFrame(warm_labels).to_csv(labelpath, header="class_labels",index=None) # TODO: verify this formats correctly for gosdt (shouldn't require headers) + + #if thresh is false, we haven't yet loaded the train and test sets to be passed to gosdt. 
We do so now
+    # (ASSUMES THE ORDER OF THE DATA POINTS IS THE SAME IN THE ORIGINAL AND BINARIZED DATASETS)
+    if not thresh:
+        X_train, y_train, X_test, y_test = get_non_thresh_dataset(cfg, datasetdir)
+
+    return X_train, y_train, X_test, y_test, str(labelpath), cfg
+
+def asbool(val : str):
+    if val is None or val == "None":
+        return False
+    if str(val).lower() == "true" :
+        return True
+    elif str(val).lower() == 'false' :
+        return False
+    print(f"WARNING: calling default bool() function on {val}")
+    return bool(val)
+
+def asint(val):
+    if val == "None" or val is None or val == "" :
+        return None
+    else:
+        return int(val)
+
+def update_errorfile(error_file_path, cfg) :
+
+    try :
+        error_file = open(error_file_path, "w")
+    except Exception as e:
+        print("ERROR: could not open file {} for writing".format(error_file_path))
+        raise e
+
+    # write the error file; this lets us identify the runs that caused an error
+    error_file_writer = csv.DictWriter(error_file, fieldnames=cfg.keys())
+    error_file_writer.writeheader()
+    error_file_writer.writerow(cfg)
+    error_file.close()
+
+# run a single configuration
+def run_single_configuration(cfg, force = False) :
+
+    tstart = time.perf_counter()
+
+    # ---------------------------------------------------------------------------------------------
+    # CONFIG SETUP
+    # ---------------------------------------------------------------------------------------------
+
+    # default configuration values for fields not present in `cfg`
+    default_cfg = {
+        # the index of the configuration
+        "idx" : (asint, 0),
+
+        # DATASET SETTINGS
+
+        # the used dataset
+        "dataset" : (str, 'compass'),
+        # whether or not we used threshold guessing
+        "thresh" : (asbool, False),
+        # whether or not we used lower-bound guesses
+        "lb_guess" : (asbool, False),
+        # which fold of the dataset was used
+        "fold" : (asint, 0),
+
+        # ALGORITHM SETTINGS
+
+        # the used algorithm
+        "algorithm" : (str, "gosdt"),
+        # the used regularization
+        "regularization": (float, 0.001),
+        # the depth budget
+        "depth_budget" : (asint, 0),
+        # the known optimal depth for this configuration
+        "optimaldepth" : (asint, 0),
+        # the time limit for running the algorithm
+        "time_limit" : (asint, 1800),
+
+        # HYPER PARAMETERS FOR GUESSING
+
+        # the maximum depth used for creating the thresholded dataset
+        "max_depth" : (asint, 2),
+        # the number of estimators for creating the thresholded dataset
+        "n_est" : (asint, 100),
+        # the maximum depth used for the warm lb labels
+        "lb_max_depth" : (asint, 2),
+        # the number of estimators for the warm lb labels
+        "lb_n_est" : (asint, 100),
+        # did we use cross validation
+        "cross_validate" : (asbool, True),
+    }
+    # RESULTS
+    results_template = {
+        # the host name on which the run took place
+        "hostname" : platform.node(),
+        # timestamp when it ran
+        "datetime" : datetime.now().strftime("%Y%m%d-%H%M%S"),
+
+        # the cpu information
+        "cpu" : "unknown",
+
+        # the versions for tree benchmark and gosdt
+        "gosdtversion" : 'unknown',
+        "treebenchmarkversion" : 'unknown',
+
+        # time to load the dataset
+        "dataset_loading" : -1,
+
+        # subsamples of the dataset
+        "subsamples" : None,
+        # the binary sub features
+        "binsubfeatures" : None,
+        # training accuracy
+        "train_acc" : -1,
+        # training error
+        "train_err" : -1,
+        # test accuracy
+        "test_acc" : -1,
+        # test error
+        "test_err" : -1,
+        # the depth of the tree
+        "depth" : -1,
+        # the number of leaves
+        "leaves" : -1,
+        # the number of nodes
+        "nodes" : -1,
+        # the runtime
+        "time" : -1,
+        # system time
+        "systime" : -1,
+        # the maximum memory usage
+        "mem_mb" : -1,
+        # time to perform the guessing
+        "guesstime" : -1,
+        # accuracy of the guessing
+        "guess_acc" : -1,
+        # time to generate the labels
+        "lb_time" : -1,
+        # train accuracy from the reference model generating the labels
+        "lb_train_acc": -1,
+        # train error from the reference model generating the labels
+        "lb_train_err": -1,
+        # test accuracy from the reference model generating the labels
+        "lb_test_acc": -1,
+        # test error from the reference model generating the labels
+        "lb_test_err": -1,
+        # the lower bound of the tree
+        "lb" : -1,
+        # the upper bound of the tree
+        "ub" : -1,
+        # the loss of the tree
+        "loss" : -1,
+        # the status of the run
+        "status" : "PENDING"
+    }
+
+    for k in default_cfg :
+        (cast, val) = default_cfg[k]
+        if k not in cfg:
+            print("NOTICE: config - using default value for field '{}' = '{}'".format(k, val))
+            cfg[k] = val
+        else :
+            cfg[k] = cast(cfg[k])
+
+    for k in cfg :
+        if k not in default_cfg :
+            print(f"NOTICE: config - field '{k}' part of cfg, but not default_cfg")
+
+    print("############################################################", flush = True)
+    print("# {} - {} - {} - {} - {}".format(cfg['dataset'], cfg['algorithm'], cfg['fold'], cfg['depth_budget'], cfg['regularization']))
+    print("############################################################", flush = True)
+
+    # dump the configuration
+    print(json.dumps(cfg, indent=2,sort_keys=True))
+
+    # ---------------------------------------------------------------------------------------------
+    # RESULT FILES
+    # ---------------------------------------------------------------------------------------------
+
+    # Set the results file names
+    resultsdirroot = pathlib.Path("results")
+
+    # get the configuration hash
+    cfghash = confighash(cfg)
+    print(f"config_hash: {cfghash}")
+    print(f"config_hash: {cfg['hash']}")
+    print(f"config_id: {cfg['idx']}")
+
+    # get the results directory
+    resultsdir = resultsdirroot / "runlogs" / cfghash[0:2] / cfghash
+
+    # create the results directory
+    resultsdir.mkdir(parents = True, exist_ok= True)
+
+    # the actual output files
+    result_file_path = resultsdir / "results.csv"
+    error_file_path = resultsdir / "error.csv"
+    temp_file_path = resultsdir / "results.tmp"
+    tree_file_path = resultsdir / "tree.json"
+    cfg_file_path = resultsdir / "config.json"
+
+    # check if we've already run this configuration
+    if result_file_path.exists() and not force:
+        print("already run. (success) skipping...")
+        return # This set of trials is already complete
+
+    # a leftover temp file indicates that a previous run may have failed
+    if temp_file_path.exists() and not force:
+        print("already run. 
(with failure) skipping...") + return # This set of trials is already complete + + # get the CPU information + for line in os.popen('lscpu').read().splitlines() : + if not line.startswith('Model name') : + continue + results_template['cpu'] = line.split(':')[1].strip() + break + + # store the versions + if 'TB_ENV_GOSDT_VERSION' in os.environ : + results_template['gosdtversion'] = os.environ['TB_ENV_GOSDT_VERSION'] + else : + print("NOTE: environment TB_ENV_GOSDT_VERSION is not set!") + + if 'TB_ENV_TREE_BENCHMARK_VERSION' in os.environ : + results_template['treebenchmarkversion'] = os.environ['TB_ENV_TREE_BENCHMARK_VERSION'] + else : + print("NOTE: environment TB_ENV_TREE_BENCHMARK_VERSION is not set!") + + # write the configuration map + configmapdir = resultsdirroot / "configmap" + configmapdir.mkdir(parents = True, exist_ok= True) + configmap = configmapdir / str(cfg['idx']) + with open(configmap, 'w') as f: + f.write(str(cfghash)) + f.close() + + # store the configuration json + with open(cfg_file_path, 'w') as f: + json.dump(cfg, f, indent=2) + + # now copy the results fields + for k in results_template : + if k in cfg: + print("WARNING: field '{}' already in the cfg file???".format(k)) + cfg[k] = results_template[k] + + + # --------------------------------------------------------------------------------------------- + # Open the result files + # --------------------------------------------------------------------------------------------- + + # try to create the json file to store the tree + try: + tree_file = open(tree_file_path, "w") + except Exception as e: + print("ERROR: could not open file {} for writing".format(tree_file_path)) + raise e + + # create the CSV file for writing the results + try : + temp_file = open(temp_file_path, "w") + except Exception as e: + print("ERROR: could not open file {} for writing".format(temp_file_path)) + raise e + + update_errorfile(error_file_path, cfg) + + # write the header to the CSV file + temp_file_writer = csv.DictWriter(temp_file, fieldnames=cfg.keys()) + temp_file_writer.writeheader() + + # --------------------------------------------------------------------------------------------- + # DATASET LOADING + # --------------------------------------------------------------------------------------------- + + tdataset = time.perf_counter() + print(f"duration preparations: {tdataset - tstart}s", flush=True) + + # the root directories of the datasets and the results + datasetdir = pathlib.Path("datasets") + + # the generated labels path + path_to_labels = None + + # set the current run status + cfg['status'] = "DATASET_LOADING" + + try : + # we want to run the lowerbound guess datasets + if cfg['lb_guess'] : + # if we're using dl85, we need the warm labels to indicate correctness of the predictions, + # not the predictions themselves. we handle the algorithm with and without a dot here. 
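+            # e.g. "dl85", "dl8.5", and "dl8.5_no_min_sup" all match one of the
+            # two prefixes tested below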
+                correctness = cfg['algorithm'].startswith("dl85") or cfg['algorithm'].startswith("dl8.5")
+
+                X_train, y_train, X_test, y_test, path_to_labels, cfg = get_lb_guess_dataset_with_labels(cfg, datasetdir, correctness, cfg['thresh'])
+
+            # we use thresholding, but no warm_lb dataset
+            elif cfg['thresh']:
+                X_train, y_train, X_test, y_test, cfg = get_thresh_dataset(cfg, datasetdir)
+
+            # no thresholding, running on the binarized datasets
+            else :
+                X_train, y_train, X_test, y_test = get_non_thresh_dataset(cfg, datasetdir)
+    except Exception as e:
+        print(f"ERROR: Failed to load dataset: {cfg['dataset']}", flush=True)
+        print(str(e))
+        cfg['status'] = "DATASET_FAILURE"
+        temp_file_writer.writerow(cfg)
+        temp_file.close()
+        error_file_path.unlink()
+        return
+
+    n, p = X_train.shape
+    # subsamples of the dataset
+    cfg["subsamples"] = n
+    # the binary sub features
+    cfg["binsubfeatures"] = p
+
+    trun = time.perf_counter()
+    print(f"duration dataset: {trun - tdataset}s", flush=True)
+    cfg['dataset_loading'] = trun - tdataset
+
+    update_errorfile(error_file_path, cfg)
+
+    # ---------------------------------------------------------------------------------------------
+    # Run the configuration
+    # ---------------------------------------------------------------------------------------------
+
+    # set the current run status
+    cfg['status'] = "RUNNING"
+
+    try :
+        result, tree = do_run_config(X_train, y_train, X_test, y_test, path_to_labels, cfg)
+    except Exception as e:
+        print(f"ERROR: Failed to run algorithm `{cfg['algorithm']}`", flush=True)
+        print(str(e))
+        cfg['status'] = "RUN_FAILURE"
+        temp_file_writer.writerow(cfg)
+        temp_file.close()
+        error_file_path.unlink()
+        return
+
+    # get the resource usage
+    s = getrusage(RUSAGE_SELF)
+    # ru_maxrss is at pos 2, and the amount is in KiB
+    result['mem_mb'] = s[2] / 1024
+    result['status'] = "OK"
+
+    tmodel = time.perf_counter()
+    print(f"duration model: {tmodel - tdataset}s", flush=True)
+
+    # ---------------------------------------------------------------------------------------------
+    # Save the results
+    # ---------------------------------------------------------------------------------------------
+
+    # write the results and close file
+    temp_file_writer.writerow(result)
+    temp_file.close()
+
+    # write the tree file
+    tree_file.write(tree)
+    tree_file.close()
+
+    # now rename the file to mark the results
+    os.rename(temp_file_path, result_file_path) # commit this file
+
+    # remove the error file
+    error_file_path.unlink()
+
+    # ---------------------------------------------------------------------------------------------
+    # Clean up temp files
+    # ---------------------------------------------------------------------------------------------
+
+    if path_to_labels is not None:
+        pathlib.Path(path_to_labels).unlink()
+
+    tend = time.perf_counter()
+    print(f"duration running: {tend - tstart}s", flush=True)
diff --git a/python/example.py b/python/example.py
index be3e4d6..e4967c0 100644
--- a/python/example.py
+++ b/python/example.py
@@ -28,7 +28,7 @@
 model = GOSDT(hyperparameters)
 model.fit(X, y)
 # model.load("models/iris_error.json")
-print("Execution Time: {}".format(model.time))
+print("Execution Time: {}".format(model.utime))
 
 prediction = model.predict(X)
 training_accuracy = model.score(X, y)
diff --git a/python/model/binoct.py b/python/model/binoct.py
index e9319ec..e1f9931 100644
--- a/python/model/binoct.py
+++ b/python/model/binoct.py
@@ -20,6 +20,10 @@ def __init__(self, preprocessor="complete", regularization=None, depth=1, time_l
self.regularization = regularization self.depth = depth self.time_limit = time_limit + self.lb = -1 + self.ub = -1 + self.loss = -1 + self.reported_loss = -1 def fit(self, X, y): @@ -55,7 +59,7 @@ def fit(self, X, y): remove(data_path) remove(data_path + ".json") return self - + def __translate__(self, tree, id=0, depth=-1): n_nodes = tree.node_count children_left = tree.children_left @@ -101,7 +105,7 @@ def json(self): def binary_features(self): return len(self.encoder.headers) - + def __len__(self): return len(self.tree) @@ -113,7 +117,7 @@ def nodes(self): def max_depth(self): return self.tree.maximum_depth() - + def regularization_upperbound(self, X, y): return self.tree.regularization_upperbound(X, y) diff --git a/python/model/cart.py b/python/model/cart.py index 9a7f4be..0af8367 100644 --- a/python/model/cart.py +++ b/python/model/cart.py @@ -8,13 +8,17 @@ class CART: - def __init__(self, preprocessor="complete", regularization=None, depth=1, support=1, width=1): + def __init__(self, preprocessor="none", regularization=None, depth=1, support=1, width=1): # Precompute serialized configuration - self.preprocessor = "complete" + self.preprocessor = preprocessor self.regularization = regularization self.depth = depth self.support = support self.width = width + self.lb = -1 + self.ub = -1 + self.loss = -1 + self.reported_loss = -1 def fit(self, X, y): (n, m) = X.shape @@ -28,16 +32,18 @@ def fit(self, X, y): if not self.regularization is None: regularization = self.regularization config = { - "max_depth": floor(1 / regularization) if regularization > 0 else None, - "min_samples_split": max(ceil(regularization * 2 * n), 2), - "min_samples_leaf": max(1, ceil(regularization * n)), - "max_leaf_nodes": floor(1 / (2 * regularization)) if regularization > 0 else None, - "min_impurity_decrease": regularization + # "max_depth": floor(1 / regularization) if regularization > 0 else None, + # "min_samples_split": max(ceil(regularization * 2 * n), 2), + # "min_samples_leaf": max(1, ceil(regularization * n)), + # "max_leaf_nodes": floor(1 / (2 * regularization)) if regularization > 0 else None, + # "min_impurity_decrease": regularization + "max_depth": self.depth-1, #cart's notion of depth excludes the root node, so we correct for an OBOE here + "min_samples_leaf": ceil(self.regularization * n) } model = DecisionTreeClassifier(**config) else: config = { - "max_depth": self.depth, + "max_depth": self.depth-1, #cart's notion of depth excludes the root node, so we correct for an OBOE here "min_samples_leaf": self.support, "max_leaf_nodes": self.width } @@ -46,10 +52,12 @@ def fit(self, X, y): start = time.perf_counter() model.fit(X, y) self.time = time.perf_counter() - start + self.utime = self.time + self.stime = 0 source = self.__translate__(model.tree_) self.tree = TreeClassifier(source, encoder=encoder, X=X, y=y) return self - + def __translate__(self, tree, id=0, depth=-1): n_nodes = tree.node_count children_left = tree.children_left @@ -94,7 +102,7 @@ def json(self): def binary_features(self): return len(self.encoder.headers) - + def __len__(self): return len(self.tree) @@ -106,9 +114,9 @@ def nodes(self): def max_depth(self): return self.tree.maximum_depth() - + def regularization_upperbound(self, X, y): return self.tree.regularization_upperbound(X, y) def features(self): - return self.tree.features() \ No newline at end of file + return self.tree.features() diff --git a/python/model/corels.py b/python/model/corels.py index 96d09b3..57932e8 100644 --- a/python/model/corels.py +++ 
b/python/model/corels.py @@ -14,6 +14,10 @@ def __init__(self, preprocessor="complete", search="-b", regularization=0.01, ma self.max_nodes = max_nodes self.symmetry = symmetry self.preprocessor = preprocessor + self.lb = -1 + self.ub = -1 + self.loss = -1 + self.reported_loss = -1 def fit(self, X, y): self.shape = X.shape @@ -29,6 +33,8 @@ def fit(self, X, y): start = time.perf_counter() c.fit(X, y.values.ravel()) self.time = time.perf_counter() - start + self.utime = self.time + self.stime = 0 # train = c.score(X, y.values.ravel()) source = self.__translate__(c.rl().rules) diff --git a/python/model/dl85.py b/python/model/dl85.py index c3d719c..0f95983 100644 --- a/python/model/dl85.py +++ b/python/model/dl85.py @@ -9,13 +9,18 @@ class DL85: - def __init__(self, preprocessor="complete", regularization=None, depth=1, support=1, time_limit=900): + def __init__(self, preprocessor="none", regularization=None, depth=2**30, support=1, time_limit=900, warm = None): # Precompute serialized configuration self.preprocessor = preprocessor self.regularization = regularization - self.depth = depth + self.depth = depth if depth != 0 else 2**30 #use depth 0 to convey no depth limit self.support = support self.time_limit = time_limit + self.warm = warm #represents the reference model's performance on the training set points + self.lb = -1 + self.ub = -1 + self.loss = -1 + self.reported_loss = -1 def fit(self, X, y): self.shape = X.shape @@ -29,32 +34,36 @@ def fit(self, X, y): self.encoder = encoder if not self.regularization is None: - depth = ceil(1 / self.regularization) if self.regularization > 0 else 2**30 + #in gosdt, regularization automatically implies a bound on depth - we apply that here too for fairness + # depth_constraint_from_reg = ceil(1 / self.regularization) if self.regularization > 0 else 2**30 + # depth = min(depth_constraint_from_reg, self.depth) #if the user also specified a depth constraint, we should use the tighter of these two constraints support = ceil(self.regularization * n) - - def error(sup_iter): - supports = list(sup_iter) - maxindex = np.argmax(supports) - return sum(supports) - supports[maxindex] + self.regularization * n, maxindex + + # def error(sup_iter): + # supports = list(sup_iter) + # maxindex = np.argmax(supports) + # return sum(supports) - supports[maxindex] + self.regularization * n, maxindex clf = DL85Classifier( - fast_error_function=error, - iterative=True, + # fast_error_function=error, + iterative=False, time_limit=self.time_limit, min_sup=support, - max_depth=depth + max_depth=self.depth-1 #dl8.5's notion of depth excludes the root node, so we correct for an OBOE here ) else: clf = DL85Classifier( - iterative=True, + iterative=False, time_limit=self.time_limit, min_sup=self.support, - max_depth=self.depth + max_depth=self.depth-1 #as above, correct for OBOE ) start = time.perf_counter() - clf.fit(X, y.values.ravel()) + clf.fit(X, y.values.ravel(), warm=self.warm) self.time = time.perf_counter() - start + self.utime = self.time + self.stime = 0 self.space = clf.lattice_size_ source = self.__translate__(clf.tree_) self.tree = TreeClassifier(source, encoder=encoder) @@ -79,8 +88,8 @@ def __translate__(self, node): "complexity": self.regularization } elif "feat" in node: - return { - "feature": node["feat"], + return { + "feature": node["feat"], "name": self.encoder.headers[node["feat"]], "relation": "==", "reference": 1, @@ -88,7 +97,7 @@ def __translate__(self, node): "false": self.__translate__(node["right"]) } else: - raise "Formatting Error: 
{}".format(str(node)) + raise Exception("Formatting Error: {}".format(str(node))) def predict(self, X): return self.tree.predict(X) @@ -110,7 +119,7 @@ def json(self): def binary_features(self): return len(self.encoder.headers) - + def __len__(self): return len(self.tree) @@ -122,9 +131,9 @@ def nodes(self): def max_depth(self): return self.tree.maximum_depth() - + def regularization_upperbound(self, X, y): return self.tree.regularization_upperbound(X, y) def features(self): - return self.tree.features() \ No newline at end of file + return self.tree.features() diff --git a/python/model/ghoul.py b/python/model/ghoul.py new file mode 100644 index 0000000..8f46c2c --- /dev/null +++ b/python/model/ghoul.py @@ -0,0 +1,364 @@ +import json +import pandas as pd +import time +from numpy import array +from sklearn.metrics import confusion_matrix, accuracy_score + +import gosdt # Import the GOSDT extension +from model.encoder import Encoder +from .imbalance.osdt_imb_v9 import bbound, predict # Import the special objective implementation +from .tree_classifier import TreeClassifier # Import the tree classification model + +#currently, just an implementation of gosdt that can use upper bound guesses and does not guarantee it will return a tree + +class GHOUL: + def __init__(self, configuration={}): + self.configuration = configuration + self.time = 0.0 + self.iterations = 0 + self.size = 0 + self.tree = None + self.encoder = None + + def load(self, path): + """ + Parameters + --- + path : string + path to a JSON file representing a model + """ + with open(path, 'r') as model_source: + result = model_source.read() + result = json.loads(result) + self.tree = TreeClassifier(result[0]) + + def __train__(self, X, y): + """ + Parameters + --- + X : matrix-like, shape = [n_samples by m_features] + matrix containing the training samples and features + y : array-like, shape = [n_samples by 1] + column containing the correct label for each sample in X + + Modifies + --- + trains a model using the GOSDT native extension + """ + (n, m) = X.shape + dataset = X.copy() + dataset.insert(m, "class", y) # It is expected that the last column is the label column + + gosdt.configure(json.dumps(self.configuration, separators=(',', ':'))) + result = gosdt.fit(dataset.to_csv(index=False)) # Perform extension call to train the model + + if gosdt.status() != 0: + raise "Error: GOSDT encountered an error while training" + + result = json.loads(result) # Deserialize result + #self.tree = TreeClassifier(result[0]) # Parse the first result into model + + self.time = gosdt.time() # Record the training time + self.iterations = gosdt.iterations() # Record the number of iterations + self.size = gosdt.size() # Record the graph size required + + def fit(self, X, y): + """ + Parameters + --- + X : matrix-like, shape = [n_samples by m_features] + matrix containing the training samples and features + y : array-like, shape = [n_samples by 1] + column containing the correct label for each sample in X + + Modifies + --- + trains the model so that this model instance is ready for prediction + """ + # if "objective" in self.configuration: + # if self.configuration["objective"] == "acc": + # self.configuration["theta"] = None + # self.configuration["w"] = None + # elif self.configuration["objective"] == "bacc": + # self.configuration["theta"] = None + # self.configuration["w"] = None + # elif self.configuration["objective"] == "wacc": + # self.configuration["theta"] = None + # elif self.configuration["objective"] == "f1": + # 
self.configuration["theta"] = None + # self.configuration["w"] = None + # elif self.configuration["objective"] == "auc": + # self.configuration["theta"] = None + # self.configuration["w"] = None + # elif self.configuration["objective"] == "pauc": + # self.configuration["w"] = None + # else: + # raise "Error: GOSDT does not support this accuracy objective" + # self.__python_train__(X, y) + # else: + self.__train__(X, y) + return self + + def predict(self, X): + """ + Parameters + --- + X : matrix-like, shape = [n_samples by m_features] + a matrix where each row is a sample to be predicted and each column is a feature to be used for prediction + + Returns + --- + array-like, shape = [n_sampels by 1] : a column where each element is the prediction associated with each row + """ + if self.tree is None: + raise "Error: Model not yet trained" + return self.tree.predict(X) + + def error(self, X, y, weight=None): + """ + Parameters + --- + X : matrix-like, shape = [n_samples by m_features] + an n-by-m matrix of sample and their features + y : array-like, shape = [n_samples by 1] + an n-by-1 column of labels associated with each sample + weight : real number + an n-by-1 column of weights to apply to each sample's misclassification + + Returns + --- + real number : the inaccuracy produced by applying this model overthe given dataset, with optionals for weighted inaccuracy + """ + if self.tree is None: + raise "Error: Model not yet trained" + return self.tree.error(X, y, weight=weight) + + def score(self, X, y, weight=None): + """ + Parameters + --- + X : matrix-like, shape = [n_samples by m_features] + an n-by-m matrix of sample and their features + y : array-like, shape = [n_samples by 1] + an n-by-1 column of labels associated with each sample + weight : real number + an n-by-1 column of weights to apply to each sample's misclassification + + Returns + --- + real number : the accuracy produced by applying this model overthe given dataset, with optionals for weighted accuracy + """ + if self.tree is None: + raise "Error: Model not yet trained" + return self.tree.score(X, y, weight=weight) + + def confusion(self, X, y, weight=None): + """ + Parameters + --- + X : matrix-like, shape = [n_samples by m_features] + an n-by-m matrix of sample and their features + y : array-like, shape = [n_samples by 1] + an n-by-1 column of labels associated with each sample + weight : real number + an n-by-1 column of weights to apply to each sample's misclassification + + Returns + --- + matrix-like, shape = [k_classes by k_classes] : the confusion matrix of all classes present in the dataset + """ + if self.tree is None: + raise "Error: Model not yet trained" + return self.tree.confusion(self.predict(X), y, weight=weight) + + def __len__(self): + """ + Returns + --- + natural number : The number of terminal nodes present in this tree + """ + if self.tree is None: + raise "Error: Model not yet trained" + return len(self.tree) + + def leaves(self): + """ + Returns + --- + natural number : The number of terminal nodes present in this tree + """ + if self.tree is None: + raise "Error: Model not yet trained" + return self.tree.leaves() + + def nodes(self): + """ + Returns + --- + natural number : The number of nodes present in this tree + """ + if self.tree is None: + raise "Error: Model not yet trained" + return self.tree.nodes() + + def max_depth(self): + """ + Returns + --- + natural number : the length of the longest decision path in this tree. A single-node tree will return 1. 
+        """
+        if self.tree is None:
+            raise Exception("Error: Model not yet trained")
+        return self.tree.maximum_depth()
+
+    def latex(self):
+        """
+        Note
+        ---
+        This method doesn't work well for label headers that contain underscores due to underscore being a reserved character in LaTeX
+
+        Returns
+        ---
+        string : A LaTeX string representing the model
+        """
+        if self.tree is None:
+            raise Exception("Error: Model not yet trained")
+        return self.tree.latex()
+
+    def json(self):
+        """
+        Returns
+        ---
+        string : A JSON string representing the model
+        """
+        if self.tree is None:
+            raise Exception("Error: Model not yet trained")
+        return self.tree.json()
+
+    def __python_train__(self, X, y):
+        """
+        Parameters
+        ---
+        X : matrix-like, shape = [n_samples by m_features]
+            matrix containing the training samples and features
+        y : array-like, shape = [n_samples by 1]
+            column containing the correct label for each sample in X
+
+        Modifies
+        ---
+        trains a model using the GOSDT pure Python implementation modified from OSDT
+        """
+
+        encoder = Encoder(X.values[:,:], header=X.columns[:], mode="complete", target=y[y.columns[0]])
+        headers = encoder.headers
+
+        X = pd.DataFrame(encoder.encode(X.values[:,:]), columns=encoder.headers)
+        y = y.reset_index(drop=True)
+
+        # Translation of Variables:
+        # leaves_c := data representation of leaves using decision paths
+        # pred_c := data representation of predictions
+        # dic := leaf translator
+        # nleaves := number of leaves
+        # m := number of encoded features
+        # n := number of samples
+        # totaltime := total optimization run time (includes certification)
+        # time_c := time-to-optimality
+        # R_c := minimized risk
+        # COUNT := number of models evaluated
+        # C_c := number of models evaluated at optimality
+        # accu := accuracy
+        # best_is_cart := whether the optimal model is produced by cart
+        # clf := prediction model produced by cart
+
+        start = time.perf_counter()
+        leaves_c, pred_c, dic, nleaves, m, n, totaltime, time_c, R_c, COUNT, C_c, accu, best_is_cart, clf = bbound(
+            X.values[:,:], y.values[:,-1],
+            self.configuration["objective"], self.configuration["regularization"],
+            prior_metric='curiosity',
+            w=self.configuration["w"], theta=self.configuration["theta"],
+            MAXDEPTH=float('Inf'), MAX_NLEAVES=float('Inf'), niter=float('Inf'), logon=False,
+            support=True, incre_support=True, accu_support=False, equiv_points=True,
+            lookahead=True, lenbound=True, R_c0 = 1, timelimit=self.configuration["time_limit"], init_cart = False,
+            saveTree = False, readTree = False)
+
+        self.duration = time.perf_counter() - start
+
+        if best_is_cart:
+            source = self.__translate_cart__(clf.tree_)
+        else:
+            decoded_leaves = []
+            for leaf in leaves_c:
+                decoded_leaf = tuple((dic[j] if j > 0 else -dic[-j]) for j in leaf)
+                decoded_leaves.append(decoded_leaf)
+            source = self.__translate__(dict(zip(decoded_leaves, pred_c)))
+        self.tree = TreeClassifier(source, encoder=encoder)
+        self.tree.__initialize_training_loss__(X, y)
+
+    def __translate__(self, leaves):
+        """
+        Converts the leaves of OSDT into a TreeClassifier-compatible object
+        """
+        if len(leaves) == 1:
+            return {
+                "complexity": self.configuration["regularization"],
+                "loss": 0,
+                "name": "class",
+                "prediction": list(leaves.values())[0]
+            }
+        else:
+            # count how often each feature is used across the decision paths
+            features = {}
+            for leaf in leaves.keys():
+                for e in leaf:
+                    if not abs(e) in features:
+                        features[abs(e)] = 1
+                    else:
+                        features[abs(e)] += 1
+            # choose the most frequently used feature as the split
+            split = None
+            max_freq = 0
+            for feature, frequency in features.items():
+                if frequency > max_freq:
+                    max_freq = frequency
+                    split = feature
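+            # partition the decision paths on the chosen split: paths that test
+            # the split feature positively go to the "true" subtree, the rest to
+            # the "false" subtree, then recurse on each side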
+            positive_leaves = {}
+            negative_leaves = {}
+            for leaf, prediction in leaves.items():
+                if split in leaf:
+                    positive_leaves[tuple(s for s in leaf if s != split)] = prediction
+                else:
+                    negative_leaves[tuple(s for s in leaf if s != -split)] = prediction
+            return {
+                "feature": split,
+                "name": "feature_" + str(split),
+                "reference": 1,
+                "relation": "==",
+                "true": self.__translate__(positive_leaves),
+                "false": self.__translate__(negative_leaves),
+            }
+
+    def __translate_cart__(self, tree, id=0, depth=-1):
+        """
+        Converts the CART results into a TreeClassifier-compatible object
+        """
+        n_nodes = tree.node_count
+        children_left = tree.children_left
+        children_right = tree.children_right
+        feature = tree.feature
+        threshold = tree.threshold
+
+        if tree.children_left[id] != children_right[id]:
+            return {
+                "feature": abs(tree.feature[id]),
+                "name": "feature_" + str(abs(tree.feature[id])),
+                "relation": "<=",
+                "reference": tree.threshold[id],
+                "true": self.__translate_cart__(tree, id=tree.children_left[id], depth=depth+1),
+                "false": self.__translate_cart__(tree, id=tree.children_right[id], depth=depth+1)
+            }
+        else:
+            return {
+                "complexity": self.configuration["regularization"],
+                "loss": 0,
+                "name": "class",
+                "prediction": 0 if len(tree.value[id][0]) == 1 or tree.value[id][0][0] >= tree.value[id][0][1] else 1
+            }
diff --git a/python/model/gosdt.py b/python/model/gosdt.py
index 48bc4d7..2a58fa3 100644
--- a/python/model/gosdt.py
+++ b/python/model/gosdt.py
@@ -4,7 +4,11 @@
 from numpy import array
 from sklearn.metrics import confusion_matrix, accuracy_score
 
-import gosdt # Import the GOSDT extension
+import libgosdt as gosdt # Import the GOSDT extension
+from gosdt.model.encoder import Encoder
+from gosdt.model.imbalance.osdt_imb_v9 import bbound, predict # Import the special objective implementation
+from gosdt.model.tree_classifier import TreeClassifier # Import the tree classification model
+
 from model.encoder import Encoder
 from .imbalance.osdt_imb_v9 import bbound, predict # Import the special objective implementation
 from .tree_classifier import TreeClassifier # Import the tree classification model
@@ -13,10 +17,19 @@ class GOSDT:
     def __init__(self, configuration={}):
         self.configuration = configuration
         self.time = 0.0
+        self.stime = 0.0
+        self.utime = 0.0
+        self.maxmem = 0
+        self.numswap = 0
+        self.numctxtswitch = 0
         self.iterations = 0
         self.size = 0
         self.tree = None
         self.encoder = None
+        self.lb = 0
+        self.ub = 0
+        self.timeout = False
+        self.reported_loss = 0
 
     def load(self, path):
         """
@@ -50,16 +63,42 @@ def __train__(self, X, y):
         gosdt.configure(json.dumps(self.configuration, separators=(',', ':')))
         result = gosdt.fit(dataset.to_csv(index=False)) # Perform extension call to train the model
 
-        if gosdt.status() != 0:
-            raise "Error: GOSDT encountered an error while training"
+        self.time = gosdt.time() # Record the training time
+        self.stime = gosdt.stime()
+        self.utime = gosdt.utime()
+
+        if gosdt.status() == 0:
+            print("gosdt reported successful execution")
+            self.timeout = False
+        elif gosdt.status() == 2:
+            print("gosdt reported possible timeout.")
+            self.timeout = True
+            self.time = -1
+            self.stime = -1
+            self.utime = -1
+        else :
+            print('----------------------------------------------')
+            print(result)
+            print('----------------------------------------------')
+            raise Exception("Error: GOSDT encountered an error while training")
+
+        result = json.loads(result) # Deserialize result
 
-        result = json.loads(result) # Deserialize result
         self.tree = TreeClassifier(result[0]) # Parse the first result into model
-
-        self.time 
= gosdt.time() # Record the training time self.iterations = gosdt.iterations() # Record the number of iterations self.size = gosdt.size() # Record the graph size required + self.maxmem = gosdt.maxmem() + self.numswap = gosdt.numswap() + self.numctxtswitch = gosdt.numctxtswitch() + + self.lb = gosdt.lower_bound() # Record reported global lower bound of algorithm + self.ub = gosdt.upper_bound() # Record reported global upper bound of algorithm + self.reported_loss = gosdt.model_loss() # Record reported training loss of returned tree + + print("training completed. {:.3f}/{:.3f}/{:.3f} (user, system, wall), mem={} MB".format(self.utime, self.stime, self.time, self.maxmem >> 10)) + print("bounds: [{:.6f}..{:.6f}] ({:.6f}) loss={:.6f}, iterations={}".format(self.lb, self.ub,self.ub - self.lb, self.reported_loss, self.iterations)) + def fit(self, X, y): """ Parameters @@ -91,7 +130,7 @@ def fit(self, X, y): elif self.configuration["objective"] == "pauc": self.configuration["w"] = None else: - raise "Error: GOSDT does not support this accuracy objective" + raise Exception("Error: GOSDT does not support this accuracy objective") self.__python_train__(X, y) else: self.__train__(X, y) @@ -109,13 +148,13 @@ def predict(self, X): array-like, shape = [n_sampels by 1] : a column where each element is the prediction associated with each row """ if self.tree is None: - raise "Error: Model not yet trained" + raise Exception("Error: Model not yet trained") return self.tree.predict(X) def error(self, X, y, weight=None): """ Parameters - --- + --- X : matrix-like, shape = [n_samples by m_features] an n-by-m matrix of sample and their features y : array-like, shape = [n_samples by 1] @@ -128,13 +167,13 @@ def error(self, X, y, weight=None): real number : the inaccuracy produced by applying this model overthe given dataset, with optionals for weighted inaccuracy """ if self.tree is None: - raise "Error: Model not yet trained" + raise Exception("Error: Model not yet trained") return self.tree.error(X, y, weight=weight) def score(self, X, y, weight=None): """ Parameters - --- + --- X : matrix-like, shape = [n_samples by m_features] an n-by-m matrix of sample and their features y : array-like, shape = [n_samples by 1] @@ -147,13 +186,13 @@ def score(self, X, y, weight=None): real number : the accuracy produced by applying this model overthe given dataset, with optionals for weighted accuracy """ if self.tree is None: - raise "Error: Model not yet trained" + raise Exception("Error: Model not yet trained") return self.tree.score(X, y, weight=weight) def confusion(self, X, y, weight=None): """ Parameters - --- + --- X : matrix-like, shape = [n_samples by m_features] an n-by-m matrix of sample and their features y : array-like, shape = [n_samples by 1] @@ -166,7 +205,7 @@ def confusion(self, X, y, weight=None): matrix-like, shape = [k_classes by k_classes] : the confusion matrix of all classes present in the dataset """ if self.tree is None: - raise "Error: Model not yet trained" + raise Exception("Error: Model not yet trained") return self.tree.confusion(self.predict(X), y, weight=weight) def __len__(self): @@ -176,7 +215,7 @@ def __len__(self): natural number : The number of terminal nodes present in this tree """ if self.tree is None: - raise "Error: Model not yet trained" + raise Exception("Error: Model not yet trained") return len(self.tree) def leaves(self): @@ -186,7 +225,7 @@ def leaves(self): natural number : The number of terminal nodes present in this tree """ if self.tree is None: - raise "Error: Model not 
diff --git a/python/model/osdt.py b/python/model/osdt.py
index c870fee..438240b 100644
--- a/python/model/osdt.py
+++ b/python/model/osdt.py
@@ -19,6 +19,9 @@ def __init__(self, configuration={}, preprocessor="complete"):
         self.configuration["output_limit"] = 1
         self.preprocessor = preprocessor
         self.encoder = None
+        self.lb = -1
+        self.ub = -1
+        self.loss = -1
 
         if not "objective" in self.configuration:
             self.configuration["objective"] = "acc"
@@ -50,24 +53,25 @@ def train(self, X, y):
         # print("start")
         leaves, predictions, dictionary, number_of_leaves, m, n, time_to_certification, time_to_optimality, iterations, iterations_to_optimality, acc, profile =\
-            bbound(X, y,
-                lamb=lamb, prior_metric="curiosity", timelimit=self.configuration["time_limit"], init_cart=False)
+            bbound(X, y, lamb=lamb, prior_metric="curiosity", timelimit=self.configuration["time_limit"], init_cart=False)
         # print("end")
-        
+
         # _, testaccu_OSDT = predict(leaves_c, prediction_c, dic, X_test, y_test)
 
         # leaves_c, pred_c, dic_c, nleaves_c, m_c, n_c, totaltime_c, time_c, R_c, \
         #     COUNT_c, C_c, accu_c, best_is_cart_c, clf_c, \
         #     len_queue, time_queue, time_realize_best_tree, R_best_tree, count_tree= \
-        #     bbound(X, y, 
-        #         self.configuration["objective"], self.configuration["regularization"], 
-        #         prior_metric='curiosity', 
-        #         w=self.configuration["w"], theta=self.configuration["theta"], 
+        #     bbound(X, y,
+        #         self.configuration["objective"], self.configuration["regularization"],
+        #         prior_metric='curiosity',
+        #         w=self.configuration["w"], theta=self.configuration["theta"],
         #         MAXDEPTH=float('Inf'), MAX_NLEAVES=float('Inf'), niter=float('Inf'), logon=False,
         #         support=True, incre_support=True, accu_support=False, equiv_points=True,
         #         lookahead=True, lenbound=True, R_c0 = 1, timelimit=self.configuration["time_limit"], init_cart = True,
         #         saveTree = False, readTree = False)
 
         self.time = time.perf_counter() - start
+        self.utime = self.time
 
         if "profile_output" in self.configuration:
             po = open(self.configuration["profile_output"],"w")
diff --git a/python/model/pygosdt_lib/models/objective.py b/python/model/pygosdt_lib/models/objective.py
index 1bc52b3..1e63a4e 100644
--- a/python/model/pygosdt_lib/models/objective.py
+++ b/python/model/pygosdt_lib/models/objective.py
@@ -24,7 +24,7 @@ def evaluate(self, confusion):
         # elif self.name == 'partial_auc':
         #     _, loss = self.pach_loss(leaves, theta)
         else:
-            raise "Unimplemented objective type '#{self.name}'"
+            raise Exception("Unimplemented objective type '#{self.name}'")
 
     # def leaf_predict(self, p, n, w=None):
     #     predict = 1
diff --git a/python/model/pygosdt_lib/models/osdt_classifier.py b/python/model/pygosdt_lib/models/osdt_classifier.py
index 713b6db..cc754a0 100644
--- a/python/model/pygosdt_lib/models/osdt_classifier.py
+++ b/python/model/pygosdt_lib/models/osdt_classifier.py
@@ -68,12 +68,12 @@ def fit(self, X, y):
 
     def predict(self, X_hat):
         if self.model == None:
-            raise "Error: Model not yet trained"
+            raise Exception("Error: Model not yet trained")
         predictions, accuracy = self.model(X_hat)
         return predictions
 
     def score(self, X_hat, y_hat):
         if self.model == None:
-            raise "Error: Model not yet trained"
+            raise Exception("Error: Model not yet trained")
         predictions, accuracy = self.model(X_hat, y_hat)
         return accuracy
diff --git a/python/model/pygosdt_lib/models/parallel_osdt_classifier.py b/python/model/pygosdt_lib/models/parallel_osdt_classifier.py
index 7aaac9a..01db764 100644
--- a/python/model/pygosdt_lib/models/parallel_osdt_classifier.py
+++ b/python/model/pygosdt_lib/models/parallel_osdt_classifier.py
@@ -56,7 +56,7 @@ def fit(self, X, y):
     # Make a prediction for the give unlablelled dataset
     def predict(self, X_hat):
         if self.model == None:
-            raise "Error: Model not yet trained"
+            raise Exception("Error: Model not yet trained")
         (n, m) = X_hat.shape
         predictions = array( [ [self.model.predict(X_hat[i,:])] for i in range(n) ] )
         # predictions.reshape(n, 1)
@@ -67,7 +67,7 @@ def predict(self, X_hat):
     # This would either compute training accuracy or testing accuracy
     def score(self, X_hat, y_hat):
         if self.model == None:
-            raise "Error: Model not yet trained"
+            raise Exception("Error: Model not yet trained")
         (n, m) = X_hat.shape
         predictions = self.predict(X_hat)
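For context on the raise statements rewritten across the files above and below: Python 3 removed string exceptions, so the original raise "..." form fails immediately with a TypeError and the intended message is never delivered. A short, self-contained illustration of the behavior this patch fixes:

    # Under Python 3, raising a plain string is itself an error:
    try:
        raise "Error: Model not yet trained"
    except TypeError as error:
        print(error)  # "exceptions must derive from BaseException"

    # Wrapping the message in Exception, as this patch does everywhere, raises it properly:
    raise Exception("Error: Model not yet trained")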
diff --git a/python/model/tree_classifier.py b/python/model/tree_classifier.py
index 0c238bf..84188b9 100644
--- a/python/model/tree_classifier.py
+++ b/python/model/tree_classifier.py
@@ -89,7 +89,7 @@ def __find_leaf__(self, sample):
                 else:
                     nodes.append(node["false"])
             else:
-                raise "Unsupported relational operator {}".format(node["relation"])
+                raise Exception("Unsupported relational operator {}".format(node["relation"]))
 
     def __all_leaves__(self):
         """
@@ -342,7 +342,7 @@ def __str__(self):
                     else:
                         predicates.append("{} != {}".format(name, str(list(domain["negative"])[0])))
                 else:
-                    raise "Invalid Rule"
+                    raise Exception("Invalid Rule")
             elif domain["type"] == "Numerical":
                 predicate = name
                 if domain["min"] != -float("INF"):
@@ -451,7 +451,7 @@ def __groups__(self, node=None):
                 elif condition_result == "false":
                     rule["negative"].add(reference)
                 else:
-                    raise "OptimalSparseDecisionTree: Malformatted source {}".format(node)
+                    raise Exception("OptimalSparseDecisionTree: Malformatted source {}".format(node))
             elif node["relation"] == ">=":
                 rule["type"] = "Numerical"
                 if "max" not in rule:
@@ -463,9 +463,9 @@ def __groups__(self, node=None):
                 elif condition_result == "false":
                     rule["max"] = min(reference, rule["max"])
                 else:
-                    raise "OptimalSparseDecisionTree: Malformatted source {}".format(node)
+                    raise Exception("OptimalSparseDecisionTree: Malformatted source {}".format(node))
             else:
-                raise "Unsupported relational operator {}".format(node["relation"])
+                raise Exception("Unsupported relational operator {}".format(node["relation"]))
 
             # Add the modified group to the group list
             groups.append(group)
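The run.py rewrite below replaces the old plotting subcommands with two entry points that hand one experiment configuration to run_single_configuration: 'csv' selects a row from a configurations CSV, and 'test' takes an inline JSON object or falls back to a built-in compas configuration. Hedged invocation examples follow; the CSV file name is hypothetical, and the JSON keys simply mirror the built-in test configuration:

    python3 python/run.py csv experiments.csv 0        # run the configuration in row 0
    python3 python/run.py csv experiments.csv 0 rerun  # force a re-run even if results exist
    python3 python/run.py test                         # run the built-in compas configuration
    python3 python/run.py test '{"dataset": "compas", "thresh": false, "lb_guess": true, "fold": 2, "algorithm": "gosdt", "regularization": 0.001, "depth_budget": 4, "optimaldepth": 0, "time_limit": 1800, "max_depth": 3, "n_est": 20, "lb_max_depth": 3, "lb_n_est": 20, "cross_validate": false}'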
[algorithms,...]") + csvfilename = sys.argv[2] + index = int(sys.argv[3]) + + force = False + if len(sys.argv) == 5 : + force = (sys.argv[4] == 'rerun') + + try : + csvfile = open(csvfilename, newline='') + except : + print("Failed to open the CSV file: {}".format(csvfilename)) exit(1) - for dataset in sys.argv[2].split(","): - plot_tradeoff(dataset, sys.argv[3].split(",")) -elif command == "scale": - if len(sys.argv) < 3: - print("Usage python3 python/run.py scale [datasets,...] [algorithms,...]") + + configreader = csv.DictReader(csvfile, delimiter=',', quotechar='|') + + config = None + numconfigs = 0 + for i, x in enumerate(configreader): + numconfigs += 1 + if i == index : + config = x + break + + if config == None : + print("Configuration index {} out of range [0, {})".format(index, numconfigs)) exit(1) - for dataset in sys.argv[2].split(","): - samples(dataset, sys.argv[3].split(",")) - features(dataset, sys.argv[3].split(",")) -elif command == "scalability": + + # set the configuration index + config['idx'] = index + + run_single_configuration(config, force) + +if command == "test" : if len(sys.argv) < 3: - print("Usage python3 python/run.py scalability [datasets,...] [algorithms,...]") - exit(1) - for dataset in sys.argv[2].split(","): - plot_scalability(dataset, sys.argv[3].split(",")) + # change the config here + config = { + 'dataset': 'compas', + 'thresh': False, + 'lb_guess': True, + 'fold': 2, + 'algorithm': 'gosdt', + 'regularization': 0.001, + 'depth_budget': 4, + 'optimaldepth': 0, + 'time_limit': 1800, + 'max_depth': 3, + 'n_est': 20, + 'lb_max_depth': 3, + 'lb_n_est': 20, + 'cross_validate': False, } + else : + #pass some json as cmdline + config = json.loads(sys.argv[2]) + run_single_configuration(config, force=True) \ No newline at end of file diff --git a/python/visualizations/trace.py b/python/visualizations/trace.py index 1efe0d8..ddd1bf2 100644 --- a/python/visualizations/trace.py +++ b/python/visualizations/trace.py @@ -1,3 +1,4 @@ +import sys import matplotlib.pyplot as plt import networkx as nx import json @@ -125,12 +126,23 @@ def replot(i): plot.renderers.append(renderer) return plot +def render(attrname, old, new): + layout.children[1] = replot(iterator.value) + + +if len(sys.argv) == 1: + print("ERROR: Please specify directory path with trace files") + print("Usage: bokeh serve --show trace.py --args $directoryPath") + sys.exit() + +path = sys.argv[1] + i = 1 min_iter = 0 -while not file_exists("trace/{}.gml".format(min_iter)): +while not file_exists("{}/{}.gml".format(path, min_iter)): min_iter += 1 max_iter = min_iter -while file_exists("trace/{}.gml".format(max_iter + 1)): +while file_exists("{}/{}.gml".format(path, max_iter + 1)): max_iter += 1 doc = curdoc() @@ -138,11 +150,8 @@ def replot(i): iterator = Slider(title="Iteration", value=min_iter, start=min_iter, end=max_iter, step=1) layout = column(iterator, replot(iterator.value)) -def render(attrname, old, new): - layout.children[1] = replot(iterator.value) - # thread = Thread(target=background) # thread.start() iterator.on_change('value', render) -doc.add_root(layout) \ No newline at end of file +doc.add_root(layout) diff --git a/python/visualizations/tree.py b/python/visualizations/tree.py index b5a4a71..480ad36 100644 --- a/python/visualizations/tree.py +++ b/python/visualizations/tree.py @@ -1,3 +1,4 @@ +import sys import matplotlib.pyplot as plt import networkx as nx import json @@ -17,7 +18,7 @@ from functools import partial from time import sleep -points = 
diff --git a/python/visualizations/tree.py b/python/visualizations/tree.py
index b5a4a71..480ad36 100644
--- a/python/visualizations/tree.py
+++ b/python/visualizations/tree.py
@@ -1,3 +1,4 @@
+import sys
 import matplotlib.pyplot as plt
 import networkx as nx
 import json
@@ -17,7 +18,7 @@
 from functools import partial
 from time import sleep
 
-points = pd.DataFrame(pd.read_csv("datasets/gaussian/hundred.csv", delimiter=","))
+# points = pd.DataFrame(pd.read_csv("dataset/data.csv", delimiter=","))
 
 def file_exists(path):
     exists = False
@@ -33,10 +34,10 @@ def file_exists(path):
         file_descriptor.close()
     return exists
 
-def replot(i):
+def replot(i, path):
     # colormap = {'setosa': 'red', 'versicolor': 'green', 'virginica': 'blue'}
     # colors = [colormap[x] for x in flowers['species']]
-    path = "tree/{}.gml".format(i)
+    path = "{}/{}.gml".format(path, i)
     with open(path) as json_file:
         data = json.load(json_file)
     G = nx.readwrite.json_graph.node_link_graph(data)
@@ -175,27 +176,35 @@ def replot(i):
 
     # plot.renderers.extend(spans)
 
-    plot.circle(points.values[:,0], points.values[:,1], legend_label='ground truth', fill_alpha=0.2, size=3)
+    #plot.circle(points.values[:,0], points.values[:,1], legend_label='ground truth', fill_alpha=0.2, size=3)
 
     # plot.legend.location = "top_left"
     # plot.legend.click_policy = "hide"
     return plot
 
+
+if len(sys.argv) == 1:
+    print("ERROR: Please specify directory path with tree files")
+    print("Usage: bokeh serve --show tree.py --args $directoryPath")
+    sys.exit()
+
+path = sys.argv[1]
+
 i = 1
 min_iter = 0
-while not file_exists("tree/{}.gml".format(min_iter)):
+while not file_exists("{}/{}.gml".format(path, min_iter)):
     min_iter += 1
 max_iter = min_iter
-while file_exists("tree/{}.gml".format(max_iter + 1)):
+while file_exists("{}/{}.gml".format(path, max_iter + 1)):
     max_iter += 1
 
 doc = curdoc()
 
 iterator = Slider(title="Iteration", value=min_iter, start=min_iter, end=max_iter, step=1)
-layout = column(iterator, replot(iterator.value))
+layout = column(iterator, replot(iterator.value, path))
 
 def render(attrname, old, new):
-    layout.children[1] = replot(iterator.value)
+    layout.children[1] = replot(iterator.value, path)
 
 # thread = Thread(target=background)
 # thread.start()