Add RobustScaler (#314)

elixir-nx · Dec 31, 2024 · bc50857 · bc50857
1 parent c11afad
commit bc50857
Show file tree

Hide file tree

Showing 3 changed files with 278 additions and 0 deletions.
diff --git a/lib/scholar/options.ex b/lib/scholar/options.ex
@@ -108,4 +108,17 @@ defmodule Scholar.Options do
       {:error, "expected 'beta' to be in the range [0, inf]"}
     end
   end
+
+  # Validates the `:quantile_range` option: a `{q_min, q_max}` tuple of numbers
+  # satisfying 0.0 < q_min < q_max < 100.0.
+  # Returns `{:ok, range}` on success and `{:error, message}` for anything else.
+  def quantile_range({q_min, q_max} = range)
+      when is_number(q_min) and is_number(q_max) and q_min > 0.0 and q_max > q_min and
+             q_max < 100.0 do
+    {:ok, range}
+  end
+
+  def quantile_range(value) do
+    {:error,
+     "expected :quantile_range to be a tuple {q_min, q_max} such that 0.0 < q_min < q_max < 100.0, got: #{inspect(value)}"}
+  end
 end
diff --git a/lib/scholar/preprocessing/robust_scaler.ex b/lib/scholar/preprocessing/robust_scaler.ex
@@ -0,0 +1,149 @@
+defmodule Scholar.Preprocessing.RobustScaler do
+  @moduledoc ~S"""
+  Scale features using statistics that are robust to outliers.
+
+  This scaler removes the median and scales the data according to
+  the quantile range (defaults to the IQR: interquartile range).
+  The IQR is the range between the 1st quartile (25th percentile)
+  and the 3rd quartile (75th percentile).
+
+  Features whose quantile range is zero are only centered, not scaled.
+  """
+
+  import Nx.Defn
+
+  # Both fields hold tensors; deriving `Nx.Container` lets the struct be returned
+  # from `fit/2` and passed into `transform/2`, which are numerical definitions.
+  @derive {Nx.Container, containers: [:medians, :iqr]}
+  defstruct [:medians, :iqr]
+
+  # Options accepted by `fit/2`. The tuple itself is checked by the custom
+  # validator `Scholar.Options.quantile_range/1`.
+  opts_schema = [
+    quantile_range: [
+      type: {:custom, Scholar.Options, :quantile_range, []},
+      default: {25.0, 75.0},
+      doc: """
+      Quantile range as a tuple {q_min, q_max} defining the range of quantiles
+      to include. Must satisfy 0.0 < q_min < q_max < 100.0.
+      """
+    ]
+  ]
+
+  # Compiled once at compile time; also interpolated into the `fit/2` docs.
+  @opts_schema NimbleOptions.new!(opts_schema)
+
+  @doc """
+  Fits the scaler by computing the per-feature median and quantile range.
+
+  The input must be a tensor of shape `{num_samples, num_features}`; an
+  `ArgumentError` is raised for any other rank. Options are validated before
+  the tensor is inspected.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return values
+
+  Returns a struct with the following parameters:
+
+  * `:iqr` - the difference between the upper and lower quantile of each feature
+    (the interquartile range when the default `:quantile_range` is used).
+
+  * `:medians` - the calculated medians of each feature across samples.
+
+  ## Examples
+
+      iex> Scholar.Preprocessing.RobustScaler.fit(Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]]))
+      %Scholar.Preprocessing.RobustScaler{
+        medians: Nx.tensor([1, 0, 0]),
+        iqr: Nx.tensor([1.0, 1.0, 1.5])
+      }
+  """
+  deftransform fit(tensor, opts \\ []) do
+    validated_opts = NimbleOptions.validate!(opts, @opts_schema)
+    fit_n(tensor, validated_opts)
+  end
+
+  # Numerical part of `fit/2`: expects already-validated options and returns the
+  # fitted struct holding the column-wise medians and quantile spread.
+  defnp fit_n(tensor, opts) do
+    check_for_rank(tensor)
+
+    {lower_pct, upper_pct} = opts[:quantile_range]
+
+    # Sort each column once so both percentiles can be read from the same tensor.
+    sorted = Nx.sort(tensor, axis: 0)
+
+    lower_bound = percentile(sorted, lower_pct)
+    upper_bound = percentile(sorted, upper_pct)
+
+    %__MODULE__{
+      medians: Nx.median(tensor, axis: 0),
+      iqr: upper_bound - lower_bound
+    }
+  end
+
+  @doc """
+  Centers the tensor on the fitted medians and divides it by the fitted quantile range.
+
+  The tensor must have shape `{num_samples, num_features}`. Features whose fitted
+  range is zero are centered but left unscaled.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> scaler = Scholar.Preprocessing.RobustScaler.fit(t)
+      %Scholar.Preprocessing.RobustScaler{
+        medians: Nx.tensor([1, 0, 0]),
+        iqr: Nx.tensor([1.0, 1.0, 1.5])
+      }
+      iex> Scholar.Preprocessing.RobustScaler.transform(scaler, t)
+      #Nx.Tensor<
+        f32[3][3]
+        [
+          [0.0, -1.0, 1.3333333730697632],
+          [1.0, 0.0, 0.0],
+          [-1.0, 1.0, -0.6666666865348816]
+        ]
+      >
+  """
+  defn transform(%__MODULE__{medians: center, iqr: spread}, tensor) do
+    check_for_rank(tensor)
+    scale(tensor, center, spread)
+  end
+
+  @doc """
+  Fits the scaler on the tensor and returns that same tensor scaled.
+
+  Accepts the same options as `fit/2`.
+
+  ## Examples
+
+      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+      iex> Scholar.Preprocessing.RobustScaler.fit_transform(t)
+      #Nx.Tensor<
+        f32[3][3]
+        [
+          [0.0, -1.0, 1.3333333730697632],
+          [1.0, 0.0, 0.0],
+          [-1.0, 1.0, -0.6666666865348816]
+        ]
+      >
+  """
+  defn fit_transform(tensor, opts \\ []) do
+    scaler = fit(tensor, opts)
+    transform(scaler, tensor)
+  end
+
+  # Subtracts the medians and divides by the spread. A zero spread is replaced
+  # by 1.0 so constant features are centered instead of dividing by zero.
+  defnp scale(tensor, medians, iqr) do
+    safe_iqr = Nx.select(iqr == 0, 1.0, iqr)
+    (tensor - medians) / safe_iqr
+  end
+
+  # Column-wise `p`-th percentile with linear interpolation between the two
+  # neighbouring rows. Expects the rows to be sorted along axis 0 already.
+  defnp percentile(sorted_tensor, p) do
+    last_row = Nx.axis_size(sorted_tensor, 0) - 1
+    position = p / 100 * last_row
+
+    below = Nx.floor(position)
+    above = Nx.ceil(position)
+
+    # Fractional distance of `position` past the lower neighbouring row.
+    frac = position - below
+
+    values_below = Nx.take(sorted_tensor, Nx.as_type(below, :s64), axis: 0)
+    values_above = Nx.take(sorted_tensor, Nx.as_type(above, :s64), axis: 0)
+
+    values_below * (1.0 - frac) + values_above * frac
+  end
+
+  # Shape guard called by `fit_n/2` and `transform/2`: raises `ArgumentError`
+  # unless the tensor has rank 2, i.e. a `{num_samples, num_features}` shape.
+  defnp check_for_rank(tensor) do
+    if Nx.rank(tensor) != 2 do
+      # The trailing backslashes join the heredoc lines, so the message is a
+      # single line without a trailing newline.
+      raise ArgumentError,
+            """
+            expected tensor to have shape {num_samples, num_features}, \
+            got tensor with shape: #{inspect(Nx.shape(tensor))}\
+            """
+    end
+  end
+end
diff --git a/test/scholar/preprocessing/robust_scaler_test.exs b/test/scholar/preprocessing/robust_scaler_test.exs
@@ -0,0 +1,116 @@
+defmodule Scholar.Preprocessing.RobustScalerTest do
+  use Scholar.Case, async: true
+  alias Scholar.Preprocessing.RobustScaler
+  doctest RobustScaler
+
+  describe "fit_transform" do
+    test "applies scaling to data" do
+      input = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+
+      scaled =
+        Nx.tensor([
+          [0.0, -1.0, 1.3333333333333333],
+          [1.0, 0.0, 0.0],
+          [-1.0, 1.0, -0.6666666666666666]
+        ])
+
+      input
+      |> RobustScaler.fit_transform()
+      |> assert_all_close(scaled)
+    end
+
+    test "applies scaling to data with custom quantile range" do
+      input = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
+
+      scaled =
+        Nx.tensor([
+          [0.0, -0.7142857142857142, 1.0],
+          [0.7142857142857142, 0.0, 0.0],
+          [-0.7142857142857142, 0.7142857142857142, -0.5]
+        ])
+
+      input
+      |> RobustScaler.fit_transform(quantile_range: {10, 80})
+      |> assert_all_close(scaled)
+    end
+
+    test "handles constant data (all values the same)" do
+      # A zero quantile range must not cause a division by zero.
+      input = Nx.tensor([[5, 5, 5], [5, 5, 5], [5, 5, 5]])
+      zeros = Nx.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
+
+      input
+      |> RobustScaler.fit_transform()
+      |> assert_all_close(zeros)
+    end
+
+    test "handles already scaled data" do
+      # Median 0 and quantile range 1 in every column: scaling is the identity.
+      input = Nx.tensor([[0, -1, 1], [1, 0, 0], [-1, 1, -1]])
+
+      input
+      |> RobustScaler.fit_transform()
+      |> assert_all_close(input)
+    end
+
+    test "handles single-row tensor" do
+      input = Nx.tensor([[1, 2, 3]])
+      zeros = Nx.tensor([[0.0, 0.0, 0.0]])
+
+      input
+      |> RobustScaler.fit_transform()
+      |> assert_all_close(zeros)
+    end
+
+    test "handles single-column tensor" do
+      input = Nx.tensor([[1], [2], [3]])
+      scaled = Nx.tensor([[-1.0], [0.0], [1.0]])
+
+      input
+      |> RobustScaler.fit_transform()
+      |> assert_all_close(scaled)
+    end
+
+    test "handles data with negative values only" do
+      input = Nx.tensor([[-5, -10, -15], [-15, -5, -20], [-10, -15, -5]])
+
+      scaled =
+        Nx.tensor([
+          [1.0, 0.0, 0.0],
+          [-1.0, 1.0, -0.6666666666666666],
+          [0.0, -1.0, 1.3333333333333333]
+        ])
+
+      input
+      |> RobustScaler.fit_transform()
+      |> assert_all_close(scaled)
+    end
+
+    test "handles data with extreme outliers" do
+      input = Nx.tensor([[1, 2, 3], [1000, 2000, 3000], [-1000, -2000, -3000]])
+
+      scaled =
+        Nx.tensor([[0.0, 0.0, 0.0], [0.999, 0.999, 0.999], [-1.001, -1.001, -1.001]])
+
+      input
+      |> RobustScaler.fit_transform()
+      |> assert_all_close(scaled)
+    end
+  end
+
+  describe "errors" do
+    test "wrong input rank for fit" do
+      assert_raise ArgumentError,
+                   "expected tensor to have shape {num_samples, num_features}, got tensor with shape: {1, 1, 1}",
+                   fn ->
+                     RobustScaler.fit(Nx.tensor([[[1]]]))
+                   end
+    end
+
+    test "wrong input rank for transform" do
+      assert_raise ArgumentError,
+                   "expected tensor to have shape {num_samples, num_features}, got tensor with shape: {1, 1, 1}",
+                   fn ->
+                     RobustScaler.fit(Nx.tensor([[1]]))
+                     |> RobustScaler.transform(Nx.tensor([[[1]]]))
+                   end
+    end
+
+    test "wrong quantile range" do
+      # Use a valid rank-2 tensor so the invalid option is the only possible
+      # failure; the test no longer depends on options being validated before
+      # the rank check.
+      assert_raise NimbleOptions.ValidationError,
+                   "invalid value for :quantile_range option: expected :quantile_range to be a tuple {q_min, q_max} such that 0.0 < q_min < q_max < 100.0, got: {10, 800}",
+                   fn ->
+                     RobustScaler.fit(Nx.tensor([[1]]), quantile_range: {10, 800})
+                   end
+    end
+  end
+end