diff --git a/lib/scholar/options.ex b/lib/scholar/options.ex
index e6e95590..1bb28c33 100644
--- a/lib/scholar/options.ex
+++ b/lib/scholar/options.ex
@@ -108,4 +108,20 @@ defmodule Scholar.Options do
       {:error, "expected 'beta' to be in the range [0, inf]"}
     end
   end
+
+  # Custom NimbleOptions validator for the `:quantile_range` option.
+  # Accepts a two-element tuple of numbers with 0.0 < q_min < q_max < 100.0
+  # (both bounds exclusive); anything else yields an `{:error, message}` tuple.
+  def quantile_range(value) do
+    case value do
+      {q_min, q_max}
+      when is_number(q_min) and is_number(q_max) and 0.0 < q_min and q_min < q_max and
+             q_max < 100.0 ->
+        {:ok, {q_min, q_max}}
+
+      _ ->
+        {:error,
+         "expected :quantile_range to be a tuple {q_min, q_max} such that 0.0 < q_min < q_max < 100.0, got: #{inspect(value)}"}
+    end
+  end
 end
diff --git a/lib/scholar/preprocessing/robust_scaler.ex b/lib/scholar/preprocessing/robust_scaler.ex
new file mode 100644
index 00000000..0eed60ae
--- /dev/null
+++ b/lib/scholar/preprocessing/robust_scaler.ex
@@ -0,0 +1,164 @@
+defmodule Scholar.Preprocessing.RobustScaler do
+  @moduledoc ~S"""
+  Scale features using statistics that are robust to outliers.
+
+  This scaler removes the median and scales the data according to
+  the quantile range (defaults to IQR: Interquartile Range).
+  The IQR is the range between the 1st quartile (25th quantile)
+  and the 3rd quartile (75th quantile).
+  """
+
+  import Nx.Defn
+
+  @derive {Nx.Container, containers: [:medians, :iqr]}
+  defstruct [:medians, :iqr]
+
+  opts_schema = [
+    quantile_range: [
+      type: {:custom, Scholar.Options, :quantile_range, []},
+      default: {25.0, 75.0},
+      doc: """
+      Quantile range as a tuple {q_min, q_max} defining the range of quantiles
+      to include. Must satisfy 0.0 < q_min < q_max < 100.0.
+      """
+    ]
+  ]
+
+  @opts_schema NimbleOptions.new!(opts_schema)
+
+  @doc """
+  Compute the median and quantiles to be used for scaling.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return values
+
+  Returns a struct with the following parameters:
+
+    * `:iqr` - the calculated quantile range of each feature (the interquartile
+      range when the default `:quantile_range` is used).
+
+    * `:medians` - the calculated medians of each feature across samples.
+
+  ## Examples
+
+      iex> Scholar.Preprocessing.RobustScaler.fit(Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]]))
+      %Scholar.Preprocessing.RobustScaler{
+        medians: Nx.tensor([1, 0, 0]),
+        iqr: Nx.tensor([1.0, 1.0, 1.5])
+      }
+  """
+  deftransform fit(tensor, opts \\ []) do
+    fit_n(tensor, NimbleOptions.validate!(opts, @opts_schema))
+  end
+
+  # Numerical implementation of `fit/2`; `opts` must already be validated.
+  defnp fit_n(tensor, opts) do
+    check_for_rank(tensor)
+
+    {q_min, q_max} = opts[:quantile_range]
+
+    medians = Nx.median(tensor, axis: 0)
+
+    # Sort each column once and reuse the result for both percentiles.
+    sorted_tensor = Nx.sort(tensor, axis: 0)
+
+    lower = percentile(sorted_tensor, q_min)
+    upper = percentile(sorted_tensor, q_max)
+
+    %__MODULE__{medians: medians, iqr: upper - lower}
+  end
+
+  @doc """
+  Performs centering and scaling of the tensor using a fitted scaler.
+
+  Features whose fitted quantile range is zero are only centered, not scaled.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> scaler = Scholar.Preprocessing.RobustScaler.fit(t)
+      %Scholar.Preprocessing.RobustScaler{
+        medians: Nx.tensor([1, 0, 0]),
+        iqr: Nx.tensor([1.0, 1.0, 1.5])
+      }
+      iex> Scholar.Preprocessing.RobustScaler.transform(scaler, t)
+      #Nx.Tensor<
+        f32[3][3]
+        [
+          [0.0, -1.0, 1.3333333730697632],
+          [1.0, 0.0, 0.0],
+          [-1.0, 1.0, -0.6666666865348816]
+        ]
+      >
+  """
+  defn transform(%__MODULE__{medians: medians, iqr: iqr}, tensor) do
+    check_for_rank(tensor)
+    scale(tensor, medians, iqr)
+  end
+
+  @doc """
+  Computes the scaling parameters and applies them to transform the tensor.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> Scholar.Preprocessing.RobustScaler.fit_transform(t)
+      #Nx.Tensor<
+        f32[3][3]
+        [
+          [0.0, -1.0, 1.3333333730697632],
+          [1.0, 0.0, 0.0],
+          [-1.0, 1.0, -0.6666666865348816]
+        ]
+      >
+  """
+  defn fit_transform(tensor, opts \\ []) do
+    tensor
+    |> fit(opts)
+    |> transform(tensor)
+  end
+
+  # Centers by the medians and divides by the quantile range; a zero range
+  # is replaced by 1.0 to avoid dividing by zero.
+  defnp scale(tensor, medians, iqr) do
+    (tensor - medians) / Nx.select(iqr == 0, 1.0, iqr)
+  end
+
+  # Linearly interpolated `p`-th percentile (`p` on a 0-100 scale) taken along
+  # axis 0 of a tensor that is already sorted along that axis.
+  defnp percentile(sorted_tensor, p) do
+    num_rows = Nx.axis_size(sorted_tensor, 0)
+    # Fractional row position of the requested percentile.
+    idx = p / 100 * (num_rows - 1)
+
+    floor_idx = Nx.floor(idx)
+    lower_idx = Nx.as_type(floor_idx, :s64)
+    upper_idx = idx |> Nx.ceil() |> Nx.as_type(:s64)
+
+    lower_values = Nx.take(sorted_tensor, lower_idx, axis: 0)
+    upper_values = Nx.take(sorted_tensor, upper_idx, axis: 0)
+
+    # The upper neighbour is weighted by the fractional part of the position.
+    weight_upper = idx - floor_idx
+    weight_lower = 1.0 - weight_upper
+    lower_values * weight_lower + upper_values * weight_upper
+  end
+
+  # Raises `ArgumentError` unless the tensor has rank 2.
+  defnp check_for_rank(tensor) do
+    if Nx.rank(tensor) != 2 do
+      raise ArgumentError,
+            """
+            expected tensor to have shape {num_samples, num_features}, \
+            got tensor with shape: #{inspect(Nx.shape(tensor))}\
+            """
+    end
+  end
+end
diff --git a/test/scholar/preprocessing/robust_scaler_test.exs b/test/scholar/preprocessing/robust_scaler_test.exs
new file mode 100644
index 00000000..97db0a69
--- /dev/null
+++ b/test/scholar/preprocessing/robust_scaler_test.exs
@@ -0,0 +1,116 @@
+defmodule Scholar.Preprocessing.RobustScalerTest do
+  use Scholar.Case, async: true
+  alias Scholar.Preprocessing.RobustScaler
+  doctest RobustScaler
+
+  describe "fit_transform" do
+    test "applies scaling to data" do
+      data = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+
+      expected =
+        Nx.tensor([
+          [0.0, -1.0, 1.3333333333333333],
+          [1.0, 0.0, 0.0],
+          [-1.0, 1.0, -0.6666666666666666]
+        ])
+
+      assert_all_close(RobustScaler.fit_transform(data), expected)
+    end
+
+    test "applies scaling to data with custom quantile range" do
+      data = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+
+      expected =
+        Nx.tensor([
+          [0.0, -0.7142857142857142, 1.0],
+          [0.7142857142857142, 0.0, 0.0],
+          [-0.7142857142857142, 0.7142857142857142, -0.5]
+        ])
+
+      assert_all_close(
+        RobustScaler.fit_transform(data, quantile_range: {10, 80}),
+        expected
+      )
+    end
+
+    test "handles constant data (all values the same)" do
+      data = Nx.tensor([[5, 5, 5], [5, 5, 5], [5, 5, 5]])
+      expected = Nx.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
+
+      assert_all_close(RobustScaler.fit_transform(data), expected)
+    end
+
+    test "handles already scaled data" do
+      data = Nx.tensor([[0, -1, 1], [1, 0, 0], [-1, 1, -1]])
+      expected = data
+
+      assert_all_close(RobustScaler.fit_transform(data), expected)
+    end
+
+    test "handles single-row tensor" do
+      data = Nx.tensor([[1, 2, 3]])
+      expected = Nx.tensor([[0.0, 0.0, 0.0]])
+
+      assert_all_close(RobustScaler.fit_transform(data), expected)
+    end
+
+    test "handles single-column tensor" do
+      data = Nx.tensor([[1], [2], [3]])
+      expected = Nx.tensor([[-1.0], [0.0], [1.0]])
+
+      assert_all_close(RobustScaler.fit_transform(data), expected)
+    end
+
+    test "handles data with negative values only" do
+      data = Nx.tensor([[-5, -10, -15], [-15, -5, -20], [-10, -15, -5]])
+
+      expected =
+        Nx.tensor([
+          [1.0, 0.0, 0.0],
+          [-1.0, 1.0, -0.6666666666666666],
+          [0.0, -1.0, 1.3333333333333333]
+        ])
+
+      assert_all_close(RobustScaler.fit_transform(data), expected)
+    end
+
+    test "handles data with extreme outliers" do
+      data = Nx.tensor([[1, 2, 3], [1000, 2000, 3000], [-1000, -2000, -3000]])
+
+      expected =
+        Nx.tensor([[0.0, 0.0, 0.0], [0.999, 0.999, 0.999], [-1.001, -1.001, -1.001]])
+
+      assert_all_close(
+        RobustScaler.fit_transform(data),
+        expected
+      )
+    end
+  end
+
+  describe "errors" do
+    test "wrong input rank for fit" do
+      assert_raise ArgumentError,
+                   "expected tensor to have shape {num_samples, num_features}, got tensor with shape: {1, 1, 1}",
+                   fn ->
+                     RobustScaler.fit(Nx.tensor([[[1]]]))
+                   end
+    end
+
+    test "wrong input rank for transform" do
+      assert_raise ArgumentError,
+                   "expected tensor to have shape {num_samples, num_features}, got tensor with shape: {1, 1, 1}",
+                   fn ->
+                     RobustScaler.fit(Nx.tensor([[1]]))
+                     |> RobustScaler.transform(Nx.tensor([[[1]]]))
+                   end
+    end
+
+    test "wrong quantile range" do
+      assert_raise NimbleOptions.ValidationError,
+                   "invalid value for :quantile_range option: expected :quantile_range to be a tuple {q_min, q_max} such that 0.0 < q_min < q_max < 100.0, got: {10, 800}",
+                   fn ->
+                     RobustScaler.fit(Nx.tensor([[1]]), quantile_range: {10, 800})
+                   end
+    end
+  end
+end