diff --git a/problem.py b/problem.py
index 01b58d1..a78e2dc 100644
--- a/problem.py
+++ b/problem.py
@@ -95,9 +95,12 @@
 def _get_data(path=".", split="train"):
     # Load data from csv files into pd.DataFrame
     #
-    # returns X (input) and y (output) arrays
+    # returns X (input) DataFrame with "groups" column (SampleID integer codes)
+    #         y (output) np.array
 
-    data = pd.read_csv(os.path.join(path, "data", split + ".csv"))
+    data_df = pd.read_csv(os.path.join(path, "data", split + ".csv"))
+    data_df["SampleID"] = data_df["SampleID"].astype("category")
+    SampleID = np.array(data_df["SampleID"].cat.codes)
 
     # Retrieve the geochemical data for X.
     # FeO, Fe2O3 and FeO2O3T are dropped because FeOT
@@ -145,14 +148,16 @@ def _get_data(path=".", split="train"):
         "U",
     ]
 
-    X_majors = data.loc[:, majors]
-    X_traces = data.loc[:, traces]
+    X_majors = data_df.loc[:, majors]
+    X_traces = data_df.loc[:, traces]
     X_df = pd.concat([X_majors, X_traces], axis=1)
 
-    X = X_df.to_numpy()
+    # "groups" is attached for every split, not only "train"
+    X_df["groups"] = SampleID.tolist()
+    X = X_df
 
     # labels
-    y = np.array(data["Event"].map(cat_to_int).fillna(-1).astype("int8"))
+    y = np.array(data_df["Event"].map(cat_to_int).fillna(-1).astype("int8"))
 
     return X, y
 
diff --git a/submissions/starting_kit/classifier.py b/submissions/starting_kit/classifier.py
index 85bc36f..f95740c 100644
--- a/submissions/starting_kit/classifier.py
+++ b/submissions/starting_kit/classifier.py
@@ -17,10 +17,13 @@ def __init__(self):
         self.pipe = make_pipeline(self.transformer, self.model)
 
     def fit(self, X, y):
+        X = X.drop(["groups"], axis=1)  # SampleID codes are not a model input
         self.pipe.fit(X, y)
 
     def predict(self, X):
+        X = X.drop(["groups"], axis=1)  # SampleID codes are not a model input
         return self.pipe.predict(X)
 
     def predict_proba(self, X):
+        X = X.drop(["groups"], axis=1)  # SampleID codes are not a model input
         return self.pipe.predict_proba(X)