diff --git a/problem.py b/problem.py index 01b58d1..a78e2dc 100644 --- a/problem.py +++ b/problem.py @@ -95,9 +95,12 @@ def _get_data(path=".", split="train"): # Load data from csv files into pd.DataFrame # - # returns X (input) and y (output) arrays + # returns X_df (input) DataFrame with "groups" column (categorical) + # y (output) np.array - data = pd.read_csv(os.path.join(path, "data", split + ".csv")) + data_df = pd.read_csv(os.path.join(path, "data", split + ".csv")) + data_df["SampleID"] = data_df["SampleID"].astype("category") + SampleID = np.array(data_df["SampleID"].cat.codes) # Retrieve the geochemical data for X. # FeO, Fe2O3 and FeO2O3T are dropped because FeOT @@ -145,14 +148,16 @@ def _get_data(path=".", split="train"): "U", ] - X_majors = data.loc[:, majors] - X_traces = data.loc[:, traces] + X_majors = data_df.loc[:, majors] + X_traces = data_df.loc[:, traces] X_df = pd.concat([X_majors, X_traces], axis=1) - X = X_df.to_numpy() + # if split == "train": + X_df["groups"] = SampleID.tolist() + X = X_df # labels - y = np.array(data["Event"].map(cat_to_int).fillna(-1).astype("int8")) + y = np.array(data_df["Event"].map(cat_to_int).fillna(-1).astype("int8")) return X, y diff --git a/submissions/starting_kit/classifier.py b/submissions/starting_kit/classifier.py index 85bc36f..f95740c 100644 --- a/submissions/starting_kit/classifier.py +++ b/submissions/starting_kit/classifier.py @@ -17,10 +17,13 @@ def __init__(self): self.pipe = make_pipeline(self.transformer, self.model) def fit(self, X, y): + X = X.drop(["groups"], axis=1) self.pipe.fit(X, y) def predict(self, X): + X = X.drop(["groups"], axis=1) return self.pipe.predict(X) def predict_proba(self, X): + X = X.drop(["groups"], axis=1) return self.pipe.predict_proba(X)