Support non-native endian inputs to LabelEncoder (#6384)

`numpy`/`pandas`/`cupy` support wrapping data with non-native endianness. `cudf` currently errors when wrapping such arrays coming from `numpy`/`cupy` (but not from pandas). This PR fixes `LabelEncoder` to cast to native endianness when receiving a numpy or cupy input, and adds a corresponding test. I can confirm that this fixes the hypothesis test failures we're seeing in nightlies currently. Supersedes #6379. Authors: - Jim Crist-Harif (https://github.com/jcrist) Approvers: - Simon Adorf (https://github.com/csadorf) URL: #6384
rapidsai · Mar 5, 2025 · 14b66c7 · 14b66c7
1 parent 5beb49b
commit 14b66c7
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 0 deletions.
diff --git a/python/cuml/cuml/preprocessing/LabelEncoder.py b/python/cuml/cuml/preprocessing/LabelEncoder.py
@@ -44,6 +44,9 @@ def _to_cudf_series(y, **kwargs):
     elif isinstance(y, (np.ndarray, cp.ndarray)):
         if y.ndim == 2 and y.shape[-1] == 1:
             y = y.flatten()
+        if not y.dtype.isnative:
+            # cudf doesn't support byte-swapped arrays as inputs, coerce to native
+            y = y.astype(y.dtype.newbyteorder("="))
     if getattr(y, "dtype", None) == "float16":
         # Upcast float16 since cudf cannot handle them yet
         y = y.astype("float32")

diff --git a/python/cuml/cuml/tests/test_label_encoder.py b/python/cuml/cuml/tests/test_label_encoder.py
@@ -226,6 +226,24 @@ def test_labelencoder_fit_transform_input_types(length, cardinality, kind):
     cudf.testing.assert_index_equal(encoder.classes_, sol.cat.categories)
 
 
+@pytest.mark.parametrize("kind", ["cupy", "numpy", "pandas"])
+def test_labelencoder_fit_transform_byteswapped(kind):
+    dtype = np.dtype("i4").newbyteorder()  # int32 with swapped byte order
+    native = np.array([1, 2, 1, 3, 2, 1], dtype="i4")
+    x = native.astype(dtype)  # same values, byte-swapped layout
+    if kind == "cupy":
+        x = cp.array(x)
+    elif kind == "pandas":
+        x = pd.Series(x)
+
+    encoder = LabelEncoder()
+    res = encoder.fit_transform(x)
+    sol = cudf.Series(native).astype("category")  # expected, from native data
+
+    cudf.testing.assert_series_equal(res, sol.cat.codes)
+    cudf.testing.assert_index_equal(encoder.classes_, sol.cat.categories)
+
+
 @pytest.mark.parametrize("use_fit_transform", [False, True])
 @pytest.mark.parametrize(
     "orig_label, ord_label, expected_reverted, bad_ord_label",