Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Refactored Classifier to an sklearn estimator #12

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 32 additions & 17 deletions PyFastBDT/FastBDT.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@

import ctypes
import ctypes.util

from sklearn.base import BaseEstimator, ClassifierMixin

# Convenience aliases for the ctypes pointer types used throughout these bindings.
c_double_p = ctypes.POINTER(ctypes.c_double)
c_float_p = ctypes.POINTER(ctypes.c_float)
c_bool_p = ctypes.POINTER(ctypes.c_bool)
c_uint_p = ctypes.POINTER(ctypes.c_uint)

# Load the FastBDT C interface shared library shipped next to this module.
# (The diff residue loaded it twice; loading once is sufficient.)
FastBDT_library = ctypes.cdll.LoadLibrary(
    os.path.join(os.path.dirname(__file__), 'libFastBDT_CInterface.so'))

# Declare argument/return types so ctypes marshals values correctly.
FastBDT_library.Create.restype = ctypes.c_void_p
FastBDT_library.Delete.argtypes = [ctypes.c_void_p]
Expand Down Expand Up @@ -60,15 +63,14 @@
FastBDT_library.GetSPlot.argtypes = [ctypes.c_void_p]
# Bug fix: the original assigned to `restypes`, which is not a ctypes
# attribute — the assignment silently did nothing and GetSPlot kept the
# default C `int` return type. `restype` is what ctypes actually reads.
FastBDT_library.GetSPlot.restype = ctypes.c_bool

# Global variable-ranking API of the C library.
FastBDT_library.GetVariableRanking.argtypes = [ctypes.c_void_p]
FastBDT_library.GetVariableRanking.restype = ctypes.c_void_p
FastBDT_library.DeleteVariableRanking.argtypes = [ctypes.c_void_p]
FastBDT_library.ExtractNumberOfVariablesFromVariableRanking.argtypes = [ctypes.c_void_p]
FastBDT_library.ExtractNumberOfVariablesFromVariableRanking.restype = ctypes.c_uint
FastBDT_library.ExtractImportanceOfVariableFromVariableRanking.argtypes = [ctypes.c_void_p, ctypes.c_uint]
FastBDT_library.ExtractImportanceOfVariableFromVariableRanking.restype = ctypes.c_double

# Per-event (individual) variable ranking: takes the forest handle and a
# pointer to one event's float32 feature values.
FastBDT_library.GetIndividualVariableRanking.argtypes = [ctypes.c_void_p, c_float_p]
FastBDT_library.GetIndividualVariableRanking.restype = ctypes.c_void_p

def calculate_roc_auc(p, t, w=None):
    """Calculate the area under the receiver operating characteristic curve.

    @param p network response (score) for each event
    @param t target for each event (0 = background, 1 = signal)
    @param w optional per-event weights; defaults to unit weights
    @return absolute area under the purity-vs-efficiency curve
    """
    if w is None:
        w = np.ones(len(t))
    N = w.sum()
    T = np.sum(t * w)
    t = t * w
    index = np.argsort(p)
    efficiency = (T - np.cumsum(t[index])) / float(T)
    # The last bin has zero remaining weight, so purity is 0/0 there; compute
    # it with warnings suppressed and map the resulting NaNs to 0.
    with np.errstate(invalid='ignore', divide='ignore'):
        purity = (T - np.cumsum(t[index])) / (N - np.cumsum(w))
    purity = np.where(np.isnan(purity), 0, purity)
    # np.trapz was renamed to np.trapezoid in NumPy 2.0; support both.
    trapezoid = getattr(np, 'trapezoid', None)
    if trapezoid is None:
        trapezoid = np.trapz
    return np.abs(trapezoid(purity, efficiency))


class Classifier(object):
def __init__(self, binning=[], nTrees=100, depth=3, shrinkage=0.1, subsample=0.5, transform2probability=True, purityTransformation=[], sPlot=False, flatnessLoss=-1.0, numberOfFlatnessFeatures=0):
class Classifier(BaseEstimator, ClassifierMixin):
def __init__(self, binning=[], nTrees=100, depth=3, shrinkage=0.1, subsample=0.5, transform2probability=True,
purityTransformation=[], sPlot=False, flatnessLoss=-1.0, numberOfFlatnessFeatures=0):
"""
@param binning list of numbers with the power N used for each feature binning e.g. 8 means 2^8 bins
@param nTrees number of trees
Expand Down Expand Up @@ -131,7 +134,8 @@ def create_forest(self):
FastBDT_library.SetFlatnessLoss(forest, float(self.flatnessLoss))
FastBDT_library.SetTransform2Probability(forest, bool(self.transform2probability))
FastBDT_library.SetSPlot(forest, bool(self.sPlot))
FastBDT_library.SetPurityTransformation(forest, np.array(self.purityTransformation).ctypes.data_as(c_uint_p), int(len(self.purityTransformation)))
FastBDT_library.SetPurityTransformation(forest, np.array(self.purityTransformation).ctypes.data_as(c_uint_p),
int(len(self.purityTransformation)))
return forest

def fit(self, X, y, weights=None):
Expand All @@ -141,17 +145,26 @@ def fit(self, X, y, weights=None):
w_temp = np.require(weights, dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
numberOfEvents, numberOfFeatures = X_temp.shape
FastBDT_library.Fit(self.forest, X_temp.ctypes.data_as(c_float_p),
w_temp.ctypes.data_as(c_float_p) if weights is not None else None,
y_temp.ctypes.data_as(c_bool_p), int(numberOfEvents), int(numberOfFeatures))
w_temp.ctypes.data_as(c_float_p) if weights is not None else None,
y_temp.ctypes.data_as(c_bool_p), int(numberOfEvents), int(numberOfFeatures))
return self

def predict(self, X):
    """Return the FastBDT response for every event (row) in X.

    @param X 2d array-like of shape (n_events, n_features)
    @return 1d float32 numpy array with one response per event
    """
    X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
    N = len(X)
    p = np.require(np.zeros(N), dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
    # A single PredictArray call fills `p` in place; the diff residue
    # invoked it twice, which is redundant work for the same result.
    FastBDT_library.PredictArray(self.forest, X_temp.ctypes.data_as(c_float_p),
                                 p.ctypes.data_as(c_float_p), int(X_temp.shape[0]))
    return p


def predict_proba(self, X):
    """Return per-class probabilities, as required by the sklearn API
    (e.g. to compose this classifier inside a OneVsRestClassifier).

    Column order follows the sklearn convention for binary classes [0, 1]:
    column 0 is the background probability, column 1 the signal probability.
    @param X 2d array-like of shape (n_events, n_features)
    @return numpy array of shape (n_events, 2)
    """
    p = self.predict(X)
    # predict() yields the signal (positive-class) probability, which sklearn
    # expects in the *second* column; the original (p, 1 - p) order was
    # inverted relative to that convention.
    return np.stack((1 - p, p), axis=1)

def predict_single(self, row):
    # Return the response for one event by pointer-passing its feature
    # values to the C library; assumes `row` is a float32 numpy array
    # (the cast via c_float_p does not convert) — TODO confirm with callers.
    return FastBDT_library.Predict(self.forest, row.ctypes.data_as(c_float_p))

Expand All @@ -160,7 +173,7 @@ def save(self, weightfile):

def load(self, weightfile):
    # Restore a previously saved forest into the current handle; the path
    # is UTF-8 encoded before being handed to the C library.
    FastBDT_library.Load(self.forest, bytes(weightfile, 'utf-8'))

def individualFeatureImportance(self, X):
X_temp = np.require(X, dtype=np.float32, requirements=['A', 'W', 'C', 'O'])
_ranking = FastBDT_library.GetIndividualVariableRanking(self.forest, X_temp.ctypes.data_as(c_float_p))
Expand Down Expand Up @@ -188,7 +201,8 @@ def externFeatureImportance(self, X, y, weights=None, X_test=None, y_test=None,
numberOfEvents, numberOfFeatures = X.shape
global_auc = calculate_roc_auc(self.predict(X_test), y_test, weights_test)
forest = self.forest
importances = self._externFeatureImportance(list(range(numberOfFeatures)), global_auc, X, y, weights, X_test, y_test, weights_test)
importances = self._externFeatureImportance(list(range(numberOfFeatures)), global_auc, X, y, weights, X_test,
y_test, weights_test)
self.forest = forest
return importances

Expand All @@ -203,14 +217,15 @@ def _externFeatureImportance(self, features, global_auc, X, y, weights, X_test,
auc = calculate_roc_auc(self.predict(X_test_temp), y_test, weights_test)
FastBDT_library.Delete(self.forest)
importances[i] = global_auc - auc

most_important = max(importances.keys(), key=lambda x: importances[x])
remaining_features = [v for v in features if v != most_important]
if len(remaining_features) == 1:
return importances

importances = {most_important: importances[most_important]}
rest = self._externFeatureImportance(remaining_features, global_auc - importances[most_important], X, y, weights, X_test, y_test, weights_test)
rest = self._externFeatureImportance(remaining_features, global_auc - importances[most_important], X, y,
weights, X_test, y_test, weights_test)
importances.update(rest)
return importances

Expand Down