Skip to content

Commit

Permalink
Support non-native endian inputs to LabelEncoder (#6384)
Browse files Browse the repository at this point in the history
`numpy`/`pandas`/`cupy` support wrapping data with non-native endianness. `cudf` currently errors when wrapping such arrays coming from `numpy`/`cupy` (but not from pandas). This PR fixes `LabelEncoder` to cast to native endianness when receiving a numpy or cupy input, and adds a corresponding test. I can confirm that this fixes the hypothesis test failures we're seeing in nightlies currently.

Supersedes #6379.

Authors:
  - Jim Crist-Harif (https://github.com/jcrist)

Approvers:
  - Simon Adorf (https://github.com/csadorf)

URL: #6384
  • Loading branch information
jcrist authored Mar 5, 2025
1 parent 5beb49b commit 14b66c7
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 0 deletions.
3 changes: 3 additions & 0 deletions python/cuml/cuml/preprocessing/LabelEncoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ def _to_cudf_series(y, **kwargs):
elif isinstance(y, (np.ndarray, cp.ndarray)):
if y.ndim == 2 and y.shape[-1] == 1:
y = y.flatten()
if not y.dtype.isnative:
# cudf doesn't support byte-swapped arrays as inputs, coerce to native
y = y.astype(y.dtype.newbyteorder("="))
if getattr(y, "dtype", None) == "float16":
# Upcast float16 since cudf cannot handle them yet
y = y.astype("float32")
Expand Down
18 changes: 18 additions & 0 deletions python/cuml/cuml/tests/test_label_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,24 @@ def test_labelencoder_fit_transform_input_types(length, cardinality, kind):
cudf.testing.assert_index_equal(encoder.classes_, sol.cat.categories)


@pytest.mark.parametrize("kind", ["cupy", "numpy", "pandas"])
def test_labelencoder_fit_transform_byteswapped(kind):
    """Check fit_transform on input whose dtype has swapped byte order.

    The same integer labels are built once with the plain ``"i4"`` dtype
    and once with its byte-swapped counterpart. The byte-swapped copy is
    optionally wrapped as a cupy array or a pandas Series and passed to
    ``LabelEncoder.fit_transform``; the resulting codes and classes must
    match those of a cudf categorical built from the ``"i4"`` data.
    """
    reference = np.array([1, 2, 1, 3, 2, 1], dtype="i4")
    swapped_dtype = np.dtype("i4").newbyteorder()
    x = reference.astype(swapped_dtype)

    # "numpy" needs no wrapping: the byte-swapped ndarray is used as-is.
    if kind == "pandas":
        x = pd.Series(x)
    elif kind == "cupy":
        x = cp.array(x)

    # Expected result: categorical encoding of the reference labels.
    expected = cudf.Series(reference).astype("category")

    encoder = LabelEncoder()
    encoded = encoder.fit_transform(x)

    cudf.testing.assert_series_equal(encoded, expected.cat.codes)
    cudf.testing.assert_index_equal(encoder.classes_, expected.cat.categories)


@pytest.mark.parametrize("use_fit_transform", [False, True])
@pytest.mark.parametrize(
"orig_label, ord_label, expected_reverted, bad_ord_label",
Expand Down

0 comments on commit 14b66c7

Please sign in to comment.