From 15b49aba77650c983fd8b7aacc55adc2450f2645 Mon Sep 17 00:00:00 2001
From: Omri Kaduri
Date: Mon, 20 Jul 2020 00:33:41 +0300
Subject: [PATCH 1/2] Made `Classifier` an sklearn estimator

To make it possible to use sklearn's `OneVsRestClassifier` (for multi-class
tasks, which this repository does not currently support), I've changed the
`Classifier` class into an sklearn estimator by inheriting from
`BaseEstimator` and `ClassifierMixin`. I've also added a `predict_proba`
method.
---
 PyFastBDT/FastBDT.py | 55 +++++++++++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 18 deletions(-)

diff --git a/PyFastBDT/FastBDT.py b/PyFastBDT/FastBDT.py
index 77e2cff..a851b86 100644
--- a/PyFastBDT/FastBDT.py
+++ b/PyFastBDT/FastBDT.py
@@ -5,12 +5,15 @@
 import ctypes
 import ctypes.util
 
+
+from sklearn.base import BaseEstimator, ClassifierMixin
+
 c_double_p = ctypes.POINTER(ctypes.c_double)
 c_float_p = ctypes.POINTER(ctypes.c_float)
 c_bool_p = ctypes.POINTER(ctypes.c_bool)
 c_uint_p = ctypes.POINTER(ctypes.c_uint)
-
-FastBDT_library = ctypes.cdll.LoadLibrary(os.path.join(os.path.dirname(__file__),'libFastBDT_CInterface.so'))
+print(os.path.join(os.path.dirname(__file__)))
+FastBDT_library = ctypes.cdll.LoadLibrary(os.path.join(os.path.dirname(__file__), 'libFastBDT_CInterface.so'))
 
 FastBDT_library.Create.restype = ctypes.c_void_p
 FastBDT_library.Delete.argtypes = [ctypes.c_void_p]
@@ -60,7 +63,6 @@
 FastBDT_library.GetSPlot.argtypes = [ctypes.c_void_p]
 FastBDT_library.GetSPlot.restypes = ctypes.c_bool
 
-
 FastBDT_library.GetVariableRanking.argtypes = [ctypes.c_void_p]
 FastBDT_library.GetVariableRanking.restype = ctypes.c_void_p
 FastBDT_library.DeleteVariableRanking.argtypes = [ctypes.c_void_p]
@@ -68,7 +70,7 @@
 FastBDT_library.ExtractNumberOfVariablesFromVariableRanking.restype = ctypes.c_uint
 FastBDT_library.ExtractImportanceOfVariableFromVariableRanking.argtypes = [ctypes.c_void_p, ctypes.c_uint]
 FastBDT_library.ExtractImportanceOfVariableFromVariableRanking.restype = ctypes.c_double
- 
+
 FastBDT_library.GetIndividualVariableRanking.argtypes = [ctypes.c_void_p, c_float_p]
 FastBDT_library.GetIndividualVariableRanking.restype = ctypes.c_void_p
 
@@ -86,8 +88,8 @@ def calculate_roc_auc(p, t, w=None):
     if w is None:
         w = np.ones(len(t))
     N = w.sum()
-    T = np.sum(t*w)
-    t = t*w
+    T = np.sum(t * w)
+    t = t * w
     index = np.argsort(p)
     efficiency = (T - np.cumsum(t[index])) / float(T)
     purity = (T - np.cumsum(t[index])) / (N - np.cumsum(w))
@@ -95,8 +97,9 @@
     return np.abs(np.trapz(purity, efficiency))
 
 
-class Classifier(object):
-    def __init__(self, binning=[], nTrees=100, depth=3, shrinkage=0.1, subsample=0.5, transform2probability=True, purityTransformation=[], sPlot=False, flatnessLoss=-1.0, numberOfFlatnessFeatures=0):
+class Classifier(BaseEstimator, ClassifierMixin):
+    def __init__(self, binning=[], nTrees=100, depth=3, shrinkage=0.1, subsample=0.5, transform2probability=True,
+                 purityTransformation=[], sPlot=False, flatnessLoss=-1.0, numberOfFlatnessFeatures=0):
         """
         @param binning list of numbers with the power N used for each feature binning e.g. 8 means 2^8 bins
         @param nTrees number of trees
@@ -131,7 +134,8 @@ def create_forest(self):
         FastBDT_library.SetFlatnessLoss(forest, float(self.flatnessLoss))
         FastBDT_library.SetTransform2Probability(forest, bool(self.transform2probability))
         FastBDT_library.SetSPlot(forest, bool(self.sPlot))
-        FastBDT_library.SetPurityTransformation(forest, np.array(self.purityTransformation).ctypes.data_as(c_uint_p), int(len(self.purityTransformation)))
+        FastBDT_library.SetPurityTransformation(forest, np.array(self.purityTransformation).ctypes.data_as(c_uint_p),
+                                                 int(len(self.purityTransformation)))
         return forest
 
     def fit(self, X, y, weights=None):
@@ -141,17 +145,30 @@ def fit(self, X, y, weights=None):
         X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
         w_temp = np.require(weights, dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
         numberOfEvents, numberOfFeatures = X_temp.shape
         FastBDT_library.Fit(self.forest, X_temp.ctypes.data_as(c_float_p),
-                w_temp.ctypes.data_as(c_float_p) if weights is not None else None,
-                y_temp.ctypes.data_as(c_bool_p), int(numberOfEvents), int(numberOfFeatures))
+                            w_temp.ctypes.data_as(c_float_p) if weights is not None else None,
+                            y_temp.ctypes.data_as(c_bool_p), int(numberOfEvents), int(numberOfFeatures))
         return self
 
     def predict(self, X):
         X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
         N = len(X)
         p = np.require(np.zeros(N), dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
-        FastBDT_library.PredictArray(self.forest, X_temp.ctypes.data_as(c_float_p), p.ctypes.data_as(c_float_p), int(X_temp.shape[0]))
+        FastBDT_library.PredictArray(self.forest, X_temp.ctypes.data_as(c_float_p), p.ctypes.data_as(c_float_p),
+                                     int(X_temp.shape[0]))
         return p
-    
+
+    def predict_proba(self, X):
+        """
+        To allow composing this classifier inside sklearn's OneVsRestClassifier,
+        this method returns the per-class probabilities [P(background), P(signal)] for each prediction.
+ """ + X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O']) + N = len(X) + p = np.require(np.zeros(N), dtype=np.float32, requirements=['A', 'W', 'C', 'O']) + FastBDT_library.PredictArray(self.forest, X_temp.ctypes.data_as(c_float_p), p.ctypes.data_as(c_float_p), + int(X_temp.shape[0])) + return np.stack((p, 1 - p), axis=1) + def predict_single(self, row): return FastBDT_library.Predict(self.forest, row.ctypes.data_as(c_float_p)) @@ -160,7 +177,7 @@ def save(self, weightfile): def load(self, weightfile): FastBDT_library.Load(self.forest, bytes(weightfile, 'utf-8')) - + def individualFeatureImportance(self, X): X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O']) _ranking = FastBDT_library.GetIndividualVariableRanking(self.forest, X_temp.ctypes.data_as(c_float_p)) @@ -188,7 +205,8 @@ def externFeatureImportance(self, X, y, weights=None, X_test=None, y_test=None, numberOfEvents, numberOfFeatures = X.shape global_auc = calculate_roc_auc(self.predict(X_test), y_test, weights_test) forest = self.forest - importances = self._externFeatureImportance(list(range(numberOfFeatures)), global_auc, X, y, weights, X_test, y_test, weights_test) + importances = self._externFeatureImportance(list(range(numberOfFeatures)), global_auc, X, y, weights, X_test, + y_test, weights_test) self.forest = forest return importances @@ -203,14 +221,15 @@ def _externFeatureImportance(self, features, global_auc, X, y, weights, X_test, auc = calculate_roc_auc(self.predict(X_test_temp), y_test, weights_test) FastBDT_library.Delete(self.forest) importances[i] = global_auc - auc - + most_important = max(importances.keys(), key=lambda x: importances[x]) remaining_features = [v for v in features if v != most_important] if len(remaining_features) == 1: return importances - + importances = {most_important: importances[most_important]} - rest = self._externFeatureImportance(remaining_features, global_auc - importances[most_important], X, y, weights, X_test, y_test, weights_test) + rest = self._externFeatureImportance(remaining_features, global_auc - importances[most_important], X, y, + weights, X_test, y_test, weights_test) importances.update(rest) return importances From 75108d294716bb84974531ebb46bef251e4f1c8e Mon Sep 17 00:00:00 2001 From: Omri Kaduri Date: Mon, 20 Jul 2020 00:45:41 +0300 Subject: [PATCH 2/2] Some cleaning --- PyFastBDT/FastBDT.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/PyFastBDT/FastBDT.py b/PyFastBDT/FastBDT.py index a851b86..0830e53 100644 --- a/PyFastBDT/FastBDT.py +++ b/PyFastBDT/FastBDT.py @@ -12,7 +12,7 @@ c_float_p = ctypes.POINTER(ctypes.c_float) c_bool_p = ctypes.POINTER(ctypes.c_bool) c_uint_p = ctypes.POINTER(ctypes.c_uint) -print(os.path.join(os.path.dirname(__file__))) + FastBDT_library = ctypes.cdll.LoadLibrary(os.path.join(os.path.dirname(__file__), 'libFastBDT_CInterface.so')) FastBDT_library.Create.restype = ctypes.c_void_p @@ -162,11 +162,7 @@ def predict_proba(self, X): In-order to be able to compose this classifier inside a OneVsRestClassifier (sklearn), it is needed to return the probabilities for each prediction. """ - X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O']) - N = len(X) - p = np.require(np.zeros(N), dtype=np.float32, requirements=['A', 'W', 'C', 'O']) - FastBDT_library.PredictArray(self.forest, X_temp.ctypes.data_as(c_float_p), p.ctypes.data_as(c_float_p), - int(X_temp.shape[0])) + p = self.predict(X) return np.stack((p, 1 - p), axis=1) def predict_single(self, row):