From 15b49aba77650c983fd8b7aacc55adc2450f2645 Mon Sep 17 00:00:00 2001
From: Omri Kaduri
Date: Mon, 20 Jul 2020 00:33:41 +0300
Subject: [PATCH 1/2] Made `Classifier` an sklearn estimator

To make it possible to use sklearn's `OneVsRestClassifier` (for multi-class
tasks, which this repository does not currently support), I've changed the
`Classifier` class into an sklearn estimator by inheriting from
`BaseEstimator` and `ClassifierMixin`. I've also added a `predict_proba`
method.
---
 PyFastBDT/FastBDT.py | 55 +++++++++++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 18 deletions(-)

diff --git a/PyFastBDT/FastBDT.py b/PyFastBDT/FastBDT.py
index 77e2cff..a851b86 100644
--- a/PyFastBDT/FastBDT.py
+++ b/PyFastBDT/FastBDT.py
@@ -5,12 +5,15 @@
 import ctypes
 import ctypes.util
 
+
+from sklearn.base import BaseEstimator, ClassifierMixin
+
 c_double_p = ctypes.POINTER(ctypes.c_double)
 c_float_p = ctypes.POINTER(ctypes.c_float)
 c_bool_p = ctypes.POINTER(ctypes.c_bool)
 c_uint_p = ctypes.POINTER(ctypes.c_uint)
-
-FastBDT_library = ctypes.cdll.LoadLibrary(os.path.join(os.path.dirname(__file__),'libFastBDT_CInterface.so'))
+print(os.path.join(os.path.dirname(__file__)))
+FastBDT_library = ctypes.cdll.LoadLibrary(os.path.join(os.path.dirname(__file__), 'libFastBDT_CInterface.so'))
 
 FastBDT_library.Create.restype = ctypes.c_void_p
 FastBDT_library.Delete.argtypes = [ctypes.c_void_p]
@@ -60,7 +63,6 @@
 FastBDT_library.GetSPlot.argtypes = [ctypes.c_void_p]
 FastBDT_library.GetSPlot.restypes = ctypes.c_bool
 
-
 FastBDT_library.GetVariableRanking.argtypes = [ctypes.c_void_p]
 FastBDT_library.GetVariableRanking.restype = ctypes.c_void_p
 FastBDT_library.DeleteVariableRanking.argtypes = [ctypes.c_void_p]
@@ -68,7 +70,7 @@
 FastBDT_library.ExtractNumberOfVariablesFromVariableRanking.restype = ctypes.c_uint
 FastBDT_library.ExtractImportanceOfVariableFromVariableRanking.argtypes = [ctypes.c_void_p, ctypes.c_uint]
 FastBDT_library.ExtractImportanceOfVariableFromVariableRanking.restype = ctypes.c_double
- 
+
 FastBDT_library.GetIndividualVariableRanking.argtypes = [ctypes.c_void_p, c_float_p]
 FastBDT_library.GetIndividualVariableRanking.restype = ctypes.c_void_p
 
@@ -86,8 +88,8 @@ def calculate_roc_auc(p, t, w=None):
     if w is None:
         w = np.ones(len(t))
     N = w.sum()
-    T = np.sum(t*w)
-    t = t*w
+    T = np.sum(t * w)
+    t = t * w
     index = np.argsort(p)
     efficiency = (T - np.cumsum(t[index])) / float(T)
     purity = (T - np.cumsum(t[index])) / (N - np.cumsum(w))
@@ -95,8 +97,9 @@
     return np.abs(np.trapz(purity, efficiency))
 
 
-class Classifier(object):
-    def __init__(self, binning=[], nTrees=100, depth=3, shrinkage=0.1, subsample=0.5, transform2probability=True, purityTransformation=[], sPlot=False, flatnessLoss=-1.0, numberOfFlatnessFeatures=0):
+class Classifier(BaseEstimator, ClassifierMixin):
+    def __init__(self, binning=[], nTrees=100, depth=3, shrinkage=0.1, subsample=0.5, transform2probability=True,
+                 purityTransformation=[], sPlot=False, flatnessLoss=-1.0, numberOfFlatnessFeatures=0):
         """
         @param binning list of numbers with the power N used for each feature binning e.g. 8 means 2^8 bins
         @param nTrees number of trees
@@ -131,7 +134,8 @@ def create_forest(self):
         FastBDT_library.SetFlatnessLoss(forest, float(self.flatnessLoss))
         FastBDT_library.SetTransform2Probability(forest, bool(self.transform2probability))
         FastBDT_library.SetSPlot(forest, bool(self.sPlot))
-        FastBDT_library.SetPurityTransformation(forest, np.array(self.purityTransformation).ctypes.data_as(c_uint_p), int(len(self.purityTransformation)))
+        FastBDT_library.SetPurityTransformation(forest, np.array(self.purityTransformation).ctypes.data_as(c_uint_p),
+                                                 int(len(self.purityTransformation)))
         return forest
 
     def fit(self, X, y, weights=None):
@@ -141,17 +145,30 @@ def fit(self, X, y, weights=None):
         X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
         w_temp = np.require(weights, dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
         numberOfEvents, numberOfFeatures = X_temp.shape
         FastBDT_library.Fit(self.forest, X_temp.ctypes.data_as(c_float_p),
-                w_temp.ctypes.data_as(c_float_p) if weights is not None else None,
-                y_temp.ctypes.data_as(c_bool_p), int(numberOfEvents), int(numberOfFeatures))
+                            w_temp.ctypes.data_as(c_float_p) if weights is not None else None,
+                            y_temp.ctypes.data_as(c_bool_p), int(numberOfEvents), int(numberOfFeatures))
         return self
 
     def predict(self, X):
         X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
         N = len(X)
         p = np.require(np.zeros(N), dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
-        FastBDT_library.PredictArray(self.forest, X_temp.ctypes.data_as(c_float_p), p.ctypes.data_as(c_float_p), int(X_temp.shape[0]))
+        FastBDT_library.PredictArray(self.forest, X_temp.ctypes.data_as(c_float_p), p.ctypes.data_as(c_float_p),
+                                     int(X_temp.shape[0]))
         return p
-    
+
+    def predict_proba(self, X):
+        """
+        To allow composing this classifier inside sklearn's OneVsRestClassifier,
+        this method returns the per-class probabilities [P(background), P(signal)] for each prediction.
+ """ + X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O']) + N = len(X) + p = np.require(np.zeros(N), dtype=np.float32, requirements=['A', 'W', 'C', 'O']) + FastBDT_library.PredictArray(self.forest, X_temp.ctypes.data_as(c_float_p), p.ctypes.data_as(c_float_p), + int(X_temp.shape[0])) + return np.stack((p, 1 - p), axis=1) + def predict_single(self, row): return FastBDT_library.Predict(self.forest, row.ctypes.data_as(c_float_p)) @@ -160,7 +177,7 @@ def save(self, weightfile): def load(self, weightfile): FastBDT_library.Load(self.forest, bytes(weightfile, 'utf-8')) - + def individualFeatureImportance(self, X): X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O']) _ranking = FastBDT_library.GetIndividualVariableRanking(self.forest, X_temp.ctypes.data_as(c_float_p)) @@ -188,7 +205,8 @@ def externFeatureImportance(self, X, y, weights=None, X_test=None, y_test=None, numberOfEvents, numberOfFeatures = X.shape global_auc = calculate_roc_auc(self.predict(X_test), y_test, weights_test) forest = self.forest - importances = self._externFeatureImportance(list(range(numberOfFeatures)), global_auc, X, y, weights, X_test, y_test, weights_test) + importances = self._externFeatureImportance(list(range(numberOfFeatures)), global_auc, X, y, weights, X_test, + y_test, weights_test) self.forest = forest return importances @@ -203,14 +221,15 @@ def _externFeatureImportance(self, features, global_auc, X, y, weights, X_test, auc = calculate_roc_auc(self.predict(X_test_temp), y_test, weights_test) FastBDT_library.Delete(self.forest) importances[i] = global_auc - auc - + most_important = max(importances.keys(), key=lambda x: importances[x]) remaining_features = [v for v in features if v != most_important] if len(remaining_features) == 1: return importances - + importances = {most_important: importances[most_important]} - rest = self._externFeatureImportance(remaining_features, global_auc - importances[most_important], X, y, weights, X_test, y_test, weights_test) + rest = self._externFeatureImportance(remaining_features, global_auc - importances[most_important], X, y, + weights, X_test, y_test, weights_test) importances.update(rest) return importances From 75108d294716bb84974531ebb46bef251e4f1c8e Mon Sep 17 00:00:00 2001 From: Omri Kaduri Date: Mon, 20 Jul 2020 00:45:41 +0300 Subject: [PATCH 2/2] Some cleaning --- PyFastBDT/FastBDT.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/PyFastBDT/FastBDT.py b/PyFastBDT/FastBDT.py index a851b86..0830e53 100644 --- a/PyFastBDT/FastBDT.py +++ b/PyFastBDT/FastBDT.py @@ -12,7 +12,7 @@ c_float_p = ctypes.POINTER(ctypes.c_float) c_bool_p = ctypes.POINTER(ctypes.c_bool) c_uint_p = ctypes.POINTER(ctypes.c_uint) -print(os.path.join(os.path.dirname(__file__))) + FastBDT_library = ctypes.cdll.LoadLibrary(os.path.join(os.path.dirname(__file__), 'libFastBDT_CInterface.so')) FastBDT_library.Create.restype = ctypes.c_void_p @@ -162,11 +162,7 @@ def predict_proba(self, X): In-order to be able to compose this classifier inside a OneVsRestClassifier (sklearn), it is needed to return the probabilities for each prediction. """ - X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O']) - N = len(X) - p = np.require(np.zeros(N), dtype=np.float32, requirements=['A', 'W', 'C', 'O']) - FastBDT_library.PredictArray(self.forest, X_temp.ctypes.data_as(c_float_p), p.ctypes.data_as(c_float_p), - int(X_temp.shape[0])) + p = self.predict(X) return np.stack((p, 1 - p), axis=1) def predict_single(self, row):