Skip to content

Commit

Permalink
Add RobustScaler (#314)
Browse files Browse the repository at this point in the history
  • Loading branch information
ksew1 authored Dec 31, 2024
1 parent c11afad commit bc50857
Show file tree
Hide file tree
Showing 3 changed files with 278 additions and 0 deletions.
13 changes: 13 additions & 0 deletions lib/scholar/options.ex
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,17 @@ defmodule Scholar.Options do
{:error, "expected 'beta' to be in the range [0, inf]"}
end
end

# Validator for the `:quantile_range` option.
#
# Accepts a two-element tuple of numbers `{q_min, q_max}` satisfying
# `0.0 < q_min < q_max < 100.0` and returns it unchanged as `{:ok, {q_min, q_max}}`.
# Every other value (wrong shape, non-numeric elements, out-of-range or
# unordered bounds) produces `{:error, message}`, where the message embeds the
# inspected offending value.
def quantile_range({q_min, q_max} = range)
    when is_number(q_min) and is_number(q_max) and q_min > 0.0 and q_max > q_min and
           q_max < 100.0 do
  {:ok, range}
end

def quantile_range(value) do
  {:error,
   "expected :quantile_range to be a tuple {q_min, q_max} such that 0.0 < q_min < q_max < 100.0, got: #{inspect(value)}"}
end
end
149 changes: 149 additions & 0 deletions lib/scholar/preprocessing/robust_scaler.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
defmodule Scholar.Preprocessing.RobustScaler do
  @moduledoc ~S"""
  Scales features using statistics that are robust to outliers.

  The scaler subtracts the per-feature median and divides the result by the
  per-feature quantile range. With the default options the quantile range is
  the IQR (Interquartile Range), i.e. the distance between the 25th and the
  75th percentile of each feature.
  """

  import Nx.Defn

  @derive {Nx.Container, containers: [:medians, :iqr]}
  defstruct [:medians, :iqr]

  opts_schema = [
    quantile_range: [
      type: {:custom, Scholar.Options, :quantile_range, []},
      default: {25.0, 75.0},
      doc: """
      Quantile range as a tuple {q_min, q_max} defining the range of quantiles
      to include. Must satisfy 0.0 < q_min < q_max < 100.0.
      """
    ]
  ]

  @opts_schema NimbleOptions.new!(opts_schema)

  @doc """
  Computes the per-feature median and quantile range used for scaling.

  The input must be a rank-2 tensor; statistics are taken along axis 0.

  ## Options

  #{NimbleOptions.docs(@opts_schema)}

  ## Return values

  Returns a struct with the following parameters:

    * `:iqr` - the difference between the `q_max` and `q_min` percentiles of
      each feature (the interquartile range with the default options).

    * `:medians` - the median of each feature across samples.

  ## Examples

      iex> Scholar.Preprocessing.RobustScaler.fit(Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]]))
      %Scholar.Preprocessing.RobustScaler{
        medians: Nx.tensor([1, 0, 0]),
        iqr: Nx.tensor([1.0, 1.0, 1.5])
      }
  """
  deftransform fit(tensor, opts \\ []) do
    # Options are validated here, outside of defn, so invalid values raise
    # a NimbleOptions.ValidationError before any numerical work is traced.
    validated_opts = NimbleOptions.validate!(opts, @opts_schema)
    fit_n(tensor, validated_opts)
  end

  # Numerical part of `fit/2`; expects already validated options.
  defnp fit_n(tensor, opts) do
    check_for_rank(tensor)

    {lower_q, upper_q} = opts[:quantile_range]

    # Sort once along the sample axis and read both percentiles from it.
    sorted = Nx.sort(tensor, axis: 0)
    spread = percentile(sorted, upper_q) - percentile(sorted, lower_q)

    %__MODULE__{medians: Nx.median(tensor, axis: 0), iqr: spread}
  end

  @doc """
  Centers and scales the tensor with the statistics of a fitted scaler.

  Each feature has its median subtracted and is then divided by its quantile
  range. Features whose range is zero are divided by `1.0` instead, so they
  are only centered.

  ## Examples

      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
      iex> scaler = Scholar.Preprocessing.RobustScaler.fit(t)
      %Scholar.Preprocessing.RobustScaler{
        medians: Nx.tensor([1, 0, 0]),
        iqr: Nx.tensor([1.0, 1.0, 1.5])
      }
      iex> Scholar.Preprocessing.RobustScaler.transform(scaler, t)
      #Nx.Tensor<
        f32[3][3]
        [
          [0.0, -1.0, 1.3333333730697632],
          [1.0, 0.0, 0.0],
          [-1.0, 1.0, -0.6666666865348816]
        ]
      >
  """
  defn transform(%__MODULE__{medians: medians, iqr: iqr}, tensor) do
    check_for_rank(tensor)

    # Guard against division by zero for constant features.
    divisor = Nx.select(iqr == 0, 1.0, iqr)

    # NOTE(review): the subtraction happens in the input's own type; for
    # unsigned integer tensors it may wrap around before the division —
    # confirm against Nx type promotion rules.
    (tensor - medians) / divisor
  end

  @doc """
  Fits the scaler on the tensor and immediately transforms that same tensor.

  Accepts the same options as `fit/2`.

  ## Examples

      iex> t = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])
      iex> Scholar.Preprocessing.RobustScaler.fit_transform(t)
      #Nx.Tensor<
        f32[3][3]
        [
          [0.0, -1.0, 1.3333333730697632],
          [1.0, 0.0, 0.0],
          [-1.0, 1.0, -0.6666666865348816]
        ]
      >
  """
  defn fit_transform(tensor, opts \\ []) do
    scaler = fit(tensor, opts)
    transform(scaler, tensor)
  end

  # Returns the `p`-th percentile of every column of a tensor that is already
  # sorted along axis 0, linearly interpolating between the two nearest rows.
  defnp percentile(sorted_tensor, p) do
    last_row = Nx.axis_size(sorted_tensor, 0) - 1

    # Fractional row position of the requested percentile.
    position = p / 100 * last_row
    position_floor = Nx.floor(position)

    below = Nx.take(sorted_tensor, Nx.as_type(position_floor, :s64), axis: 0)
    above = Nx.take(sorted_tensor, position |> Nx.ceil() |> Nx.as_type(:s64), axis: 0)

    # Weight of the upper row is the fractional part of the position.
    frac = position - position_floor
    below * (1.0 - frac) + above * frac
  end

  # Raises ArgumentError unless the tensor has exactly two dimensions.
  defnp check_for_rank(tensor) do
    if Nx.rank(tensor) != 2 do
      raise ArgumentError,
            """
            expected tensor to have shape {num_samples, num_features}, \
            got tensor with shape: #{inspect(Nx.shape(tensor))}\
            """
    end
  end
end
116 changes: 116 additions & 0 deletions test/scholar/preprocessing/robust_scaler_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
defmodule Scholar.Preprocessing.RobustScalerTest do
  # Tests for RobustScaler: numerical results of fit_transform/2 on a range of
  # inputs, plus the errors raised for invalid tensors and invalid options.
  use Scholar.Case, async: true
  alias Scholar.Preprocessing.RobustScaler
  doctest RobustScaler

  describe "fit_transform" do
    test "applies scaling to data" do
      data = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])

      # Per column: medians are [1, 0, 0] and the default (25, 75) range is
      # [1.0, 1.0, 1.5].
      expected =
        Nx.tensor([
          [0.0, -1.0, 1.3333333333333333],
          [1.0, 0.0, 0.0],
          [-1.0, 1.0, -0.6666666666666666]
        ])

      assert_all_close(RobustScaler.fit_transform(data), expected)
    end

    test "applies scaling to data with custom quantile range" do
      data = Nx.tensor([[1, -1, 2], [2, 0, 0], [0, 1, -1]])

      # With the (10, 80) range the per-column spread becomes [1.4, 1.4, 2.0].
      expected =
        Nx.tensor([
          [0.0, -0.7142857142857142, 1.0],
          [0.7142857142857142, 0.0, 0.0],
          [-0.7142857142857142, 0.7142857142857142, -0.5]
        ])

      assert_all_close(
        RobustScaler.fit_transform(data, quantile_range: {10, 80}),
        expected
      )
    end

    test "handles constant data (all values the same)" do
      # A zero quantile range must not cause a division by zero.
      data = Nx.tensor([[5, 5, 5], [5, 5, 5], [5, 5, 5]])
      expected = Nx.tensor([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])

      assert_all_close(RobustScaler.fit_transform(data), expected)
    end

    test "handles already scaled data" do
      # Every column has median 0 and range 1, so scaling is the identity.
      data = Nx.tensor([[0, -1, 1], [1, 0, 0], [-1, 1, -1]])
      expected = data

      assert_all_close(RobustScaler.fit_transform(data), expected)
    end

    test "handles single-row tensor" do
      data = Nx.tensor([[1, 2, 3]])
      expected = Nx.tensor([[0.0, 0.0, 0.0]])

      assert_all_close(RobustScaler.fit_transform(data), expected)
    end

    test "handles single-column tensor" do
      data = Nx.tensor([[1], [2], [3]])
      expected = Nx.tensor([[-1.0], [0.0], [1.0]])

      assert_all_close(RobustScaler.fit_transform(data), expected)
    end

    test "handles data with negative values only" do
      data = Nx.tensor([[-5, -10, -15], [-15, -5, -20], [-10, -15, -5]])

      expected =
        Nx.tensor([
          [1.0, 0.0, 0.0],
          [-1.0, 1.0, -0.6666666666666666],
          [0.0, -1.0, 1.3333333333333333]
        ])

      assert_all_close(RobustScaler.fit_transform(data), expected)
    end

    test "handles data with extreme outliers" do
      data = Nx.tensor([[1, 2, 3], [1000, 2000, 3000], [-1000, -2000, -3000]])

      # First column: median 1, range 500.5 - (-499.5) = 1000, hence
      # (1000 - 1) / 1000 = 0.999 and (-1000 - 1) / 1000 = -1.001.
      expected =
        Nx.tensor([[0.0, 0.0, 0.0], [0.999, 0.999, 0.999], [-1.001, -1.001, -1.001]])

      assert_all_close(
        RobustScaler.fit_transform(data),
        expected
      )
    end
  end

  describe "errors" do
    test "wrong input rank for fit" do
      assert_raise ArgumentError,
                   "expected tensor to have shape {num_samples, num_features}, got tensor with shape: {1, 1, 1}",
                   fn ->
                     RobustScaler.fit(Nx.tensor([[[1]]]))
                   end
    end

    test "wrong input rank for transform" do
      assert_raise ArgumentError,
                   "expected tensor to have shape {num_samples, num_features}, got tensor with shape: {1, 1, 1}",
                   fn ->
                     RobustScaler.fit(Nx.tensor([[1]]))
                     |> RobustScaler.transform(Nx.tensor([[[1]]]))
                   end
    end

    test "wrong quantile range" do
      # The tensor is deliberately valid (rank 2) so that the only possible
      # failure is the option validation under test.
      assert_raise NimbleOptions.ValidationError,
                   "invalid value for :quantile_range option: expected :quantile_range to be a tuple {q_min, q_max} such that 0.0 < q_min < q_max < 100.0, got: {10, 800}",
                   fn ->
                     RobustScaler.fit(Nx.tensor([[1]]), quantile_range: {10, 800})
                   end
    end
  end
end

0 comments on commit bc50857

Please sign in to comment.