diff --git a/lib/scholar/naive_bayes/bernoulli.ex b/lib/scholar/naive_bayes/bernoulli.ex
new file mode 100644
index 00000000..cd9cc505
--- /dev/null
+++ b/lib/scholar/naive_bayes/bernoulli.ex
@@ -0,0 +1,447 @@
+defmodule Scholar.NaiveBayes.Bernoulli do
+  @moduledoc """
+  Naive Bayes classifier for multivariate Bernoulli models.
+
+  Like MultinomialNB, this classifier is suitable for discrete data. The
+  difference is that while MultinomialNB works with occurrence counts,
+  BernoulliNB is designed for binary/boolean features.
+  """
+  import Nx.Defn
+  import Scholar.Shared
+
+  @derive {Nx.Container,
+           containers: [
+             :feature_count,
+             :class_count,
+             :class_log_priors,
+             :feature_log_probability
+           ]}
+  defstruct [:feature_count, :class_count, :class_log_priors, :feature_log_probability]
+
+  opts_schema = [
+    num_classes: [
+      type: :pos_integer,
+      required: true,
+      doc: ~S"""
+      Number of different classes used in training.
+      """
+    ],
+    alpha: [
+      type: {:or, [:float, {:list, :float}]},
+      default: 1.0,
+      doc: ~S"""
+      Additive (Laplace/Lidstone) smoothing parameter
+      (set alpha to 0.0 and force_alpha to true, for no smoothing).
+      Either a single float or a list with one float per feature.
+      """
+    ],
+    force_alpha: [
+      type: :boolean,
+      default: true,
+      doc: ~S"""
+      If `false` and alpha is less than 1e-10, it will set alpha to
+      1e-10. If `true`, alpha will remain unchanged. This may cause
+      numerical errors if alpha is too close to 0.
+      """
+    ],
+    binarize: [
+      type: {:or, [:float, {:in, [nil]}]},
+      default: 0.0,
+      doc: ~S"""
+      Threshold for binarizing (mapping to booleans) of sample features.
+      If nil, input is presumed to already consist of binary vectors.
+      """
+    ],
+    fit_priors: [
+      type: :boolean,
+      default: true,
+      doc: ~S"""
+      Whether to learn class prior probabilities or not.
+      If `false`, a uniform prior will be used.
+      """
+    ],
+    class_priors: [
+      type: {:custom, Scholar.Options, :weights, []},
+      doc: ~S"""
+      Prior probabilities of the classes. If specified, the priors are not
+      adjusted according to the data.
+      """
+    ],
+    sample_weights: [
+      type: {:custom, Scholar.Options, :weights, []},
+      doc: ~S"""
+      List of `num_samples` elements.
+      A list of 1.0 values is used if none is given.
+      """
+    ]
+  ]
+
+  @opts_schema NimbleOptions.new!(opts_schema)
+
+  @doc """
+  Fits a naive Bayes model. The function assumes that the targets `y` are integers
+  between 0 and `num_classes` - 1 (inclusive). Otherwise, those samples will not
+  contribute to `class_count`.
+
+  ## Options
+
+  #{NimbleOptions.docs(@opts_schema)}
+
+  ## Return Values
+
+  The function returns a struct with the following parameters:
+
+    * `:class_count` - Number of samples encountered for each class during fitting. This
+      value is weighted by the sample weight when provided.
+
+    * `:class_log_priors` - Smoothed empirical log probability for each class.
+
+    * `:feature_count` - Number of samples encountered for each (class, feature)
+      during fitting. This value is weighted by the sample weight when
+      provided.
+
+    * `:feature_log_probability` - Empirical log probability of features
+      given a class, ``P(x_i|y)``.
+
+  ## Examples
+
+      iex> x = Nx.iota({4, 3})
+      iex> y = Nx.tensor([1, 2, 0, 2])
+      iex> Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3, binarize: 1.0)
+      %Scholar.NaiveBayes.Bernoulli{
+        feature_count: Nx.tensor(
+          [
+            [1.0, 1.0, 1.0],
+            [0.0, 0.0, 1.0],
+            [2.0, 2.0, 2.0]
+          ]
+        ),
+        class_count: Nx.tensor(
+          [1.0, 1.0, 2.0]
+        ),
+        class_log_priors: Nx.tensor(
+          [-1.3862943649291992, -1.3862943649291992, -0.6931471824645996]
+        ),
+        feature_log_probability: Nx.tensor(
+          [
+            [-0.40546512603759766, -0.40546512603759766, -0.40546512603759766],
+            [-1.0986123085021973, -1.0986123085021973, -0.40546512603759766],
+            [-0.28768205642700195, -0.28768205642700195, -0.28768205642700195]
+          ]
+        )
+      }
+
+      iex> x = Nx.iota({4, 3})
+      iex> y = Nx.tensor([1, 2, 0, 2])
+      iex> Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3, force_alpha: false, alpha: 0.0)
+      %Scholar.NaiveBayes.Bernoulli{
+        feature_count: Nx.tensor(
+          [
+            [1.0, 1.0, 1.0],
+            [0.0, 1.0, 1.0],
+            [2.0, 2.0, 2.0]
+          ]
+        ),
+        class_count: Nx.tensor(
+          [1.0, 1.0, 2.0]
+        ),
+        class_log_priors: Nx.tensor(
+          [-1.3862943649291992, -1.3862943649291992, -0.6931471824645996]
+        ),
+        feature_log_probability: Nx.tensor(
+          [
+            [0.0, 0.0, 0.0],
+            [-23.025850296020508, 0.0, 0.0],
+            [0.0, 0.0, 0.0]
+          ]
+        )
+      }
+  """
+
+  deftransform fit(x, y, opts \\ []) do
+    if Nx.rank(x) != 2 do
+      raise ArgumentError,
+            """
+            expected x to have shape {num_samples, num_features}, \
+            got tensor with shape: #{inspect(Nx.shape(x))}\
+            """
+    end
+
+    if Nx.rank(y) != 1 do
+      raise ArgumentError,
+            """
+            expected y to have shape {num_samples}, \
+            got tensor with shape: #{inspect(Nx.shape(y))}\
+            """
+    end
+
+    {num_samples, num_features} = Nx.shape(x)
+
+    if num_samples != Nx.axis_size(y, 0) do
+      raise ArgumentError,
+            """
+            expected first dimension of x and y to be of same size, \
+            got: #{num_samples} and #{Nx.axis_size(y, 0)}\
+            """
+    end
+
+    opts = NimbleOptions.validate!(opts, @opts_schema)
+    type = to_float_type(x)
+
+    {alpha, opts} = Keyword.pop!(opts, :alpha)
+    alpha = Nx.tensor(alpha, type: type)
+
+    if Nx.shape(alpha) not in [{}, {num_features}] do
+      raise ArgumentError,
+            """
+            when alpha is list it should have length equal to num_features = #{num_features}, \
+            got: #{Nx.size(alpha)}\
+            """
+    end
+
+    num_classes = opts[:num_classes]
+
+    priors_flag = opts[:class_priors] != nil
+
+    # Without explicit priors a NaN scalar stands in, so `fit_n/6` always gets a
+    # tensor; it only reads the value when `priors_flag` is set.
+    {class_priors, opts} = Keyword.pop(opts, :class_priors, :nan)
+    class_priors = Nx.tensor(class_priors)
+
+    if priors_flag and Nx.size(class_priors) != num_classes do
+      raise ArgumentError,
+            """
+            expected class_priors to be list of length num_classes = #{num_classes}, \
+            got: #{Nx.size(class_priors)}\
+            """
+    end
+
+    sample_weights_flag = opts[:sample_weights] != nil
+
+    # Same placeholder approach as for the class priors above.
+    {sample_weights, opts} = Keyword.pop(opts, :sample_weights, :nan)
+    sample_weights = Nx.tensor(sample_weights, type: type)
+
+    if sample_weights_flag and Nx.shape(sample_weights) != {num_samples} do
+      raise ArgumentError,
+            """
+            expected sample_weights to be list of length num_samples = #{num_samples}, \
+            got: #{Nx.size(sample_weights)}\
+            """
+    end
+
+    opts =
+      opts ++
+        [
+          type: type,
+          priors_flag: priors_flag,
+          sample_weights_flag: sample_weights_flag
+        ]
+
+    fit_n(x, y, class_priors, sample_weights, alpha, opts)
+  end
+
+  # Numerical part of `fit/3`: builds the (optionally weighted) class and feature
+  # counts and turns them into smoothed log probabilities and class log priors.
+  defnp fit_n(x, y, class_priors, sample_weights, alpha, opts) do
+    type = opts[:type]
+    num_samples = Nx.axis_size(x, 0)
+
+    num_classes = opts[:num_classes]
+
+    x =
+      case opts[:binarize] do
+        nil -> x
+        binarize -> Scholar.Preprocessing.Binarizer.fit_transform(x, threshold: binarize)
+      end
+
+    y_one_hot = Scholar.Preprocessing.OneHotEncoder.fit_transform(y, num_categories: num_classes)
+    y_one_hot = Nx.select(y_one_hot, Nx.tensor(1, type: type), Nx.tensor(0, type: type))
+
+    y_weighted =
+      if opts[:sample_weights_flag],
+        do: Nx.reshape(sample_weights, {num_samples, 1}) * y_one_hot,
+        else: y_one_hot
+
+    alpha_lower_bound = Nx.tensor(1.0e-10, type: type)
+
+    alpha =
+      if opts[:force_alpha], do: alpha, else: Nx.max(alpha, alpha_lower_bound)
+
+    class_count = Nx.sum(y_weighted, axes: [0])
+    feature_count = Nx.dot(y_weighted, [0], x, [0])
+
+    smoothed_feature_count = feature_count + alpha
+
+    # Every feature is a two-outcome (present/absent) variable, so the denominator
+    # for feature i is class_count + 2 * alpha_i. Reshaping class_count to a column
+    # first lets a per-feature alpha broadcast to {num_classes, num_features}.
+    smoothed_cumulative_count = Nx.reshape(class_count, {num_classes, 1}) + alpha * 2
+
+    feature_log_probability =
+      Nx.log(smoothed_feature_count) - Nx.log(smoothed_cumulative_count)
+
+    class_log_priors =
+      cond do
+        opts[:priors_flag] ->
+          Nx.log(class_priors)
+
+        opts[:fit_priors] ->
+          Nx.log(class_count) - Nx.log(Nx.sum(class_count))
+
+        true ->
+          Nx.broadcast(-Nx.log(num_classes), {num_classes})
+      end
+
+    %__MODULE__{
+      class_count: class_count,
+      class_log_priors: class_log_priors,
+      feature_count: feature_count,
+      feature_log_probability: feature_log_probability
+    }
+  end
+
+  @doc """
+  Perform classification on an array of test vectors `x` using `model`.
+  You need to add sorted classes from the training data as the third argument.
+
+  Note that `x` is used as given: the `:binarize` threshold passed to `fit/3`
+  is not applied here, so binarize the test vectors beforehand if needed.
+
+  ## Examples
+
+      iex> x = Nx.iota({4, 3})
+      iex> y = Nx.tensor([1, 2, 0, 2])
+      iex> model = Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3)
+      iex> Scholar.NaiveBayes.Bernoulli.predict(model, Nx.tensor([[6, 2, 4], [8, 5, 9]]), Nx.tensor([0, 1, 2]))
+      #Nx.Tensor<
+        s32[2]
+        [2, 2]
+      >
+  """
+
+  defn predict(%__MODULE__{} = model, x, classes) do
+    check_dim(x, Nx.axis_size(model.feature_count, 1))
+
+    if Nx.rank(classes) != 1 do
+      raise ArgumentError,
+            """
+            expected classes to be a 1D tensor, \
+            got tensor with shape: #{inspect(Nx.shape(classes))}\
+            """
+    end
+
+    if Nx.axis_size(classes, 0) != Nx.axis_size(model.class_count, 0) do
+      raise ArgumentError,
+            """
+            expected classes to have same size as the number of classes in the model, \
+            got: #{Nx.axis_size(classes, 0)} for classes and #{Nx.axis_size(model.class_count, 0)} for model\
+            """
+    end
+
+    jll = joint_log_likelihood(model, x)
+    classes[Nx.argmax(jll, axis: 1)]
+  end
+
+  @doc """
+  Return log-probability estimates for the test vector `x` using `model`.
+
+  ## Examples
+
+      iex> x = Nx.iota({4, 3})
+      iex> y = Nx.tensor([1, 2, 0, 2])
+      iex> model = Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3)
+      iex> Scholar.NaiveBayes.Bernoulli.predict_log_probability(model, Nx.tensor([[6, 2, 4], [8, 5, 9]]))
+      #Nx.Tensor<
+        f32[2][3]
+        [
+          [-4.704780578613281, -12.329399108886719, -0.009097099304199219],
+          [-8.750494003295898, -19.147701263427734, -1.583099365234375e-4]
+        ]
+      >
+  """
+
+  defn predict_log_probability(%__MODULE__{} = model, x) do
+    check_dim(x, Nx.axis_size(model.feature_count, 1))
+    jll = joint_log_likelihood(model, x)
+
+    log_proba_x =
+      jll
+      |> Nx.logsumexp(axes: [1])
+      |> Nx.reshape({Nx.axis_size(jll, 0), 1})
+
+    jll - log_proba_x
+  end
+
+  @doc """
+  Return probability estimates for the test vector `x` using `model`.
+
+  ## Examples
+
+      iex> x = Nx.iota({4, 3})
+      iex> y = Nx.tensor([1, 2, 0, 2])
+      iex> model = Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3)
+      iex> Scholar.NaiveBayes.Bernoulli.predict_probability(model, Nx.tensor([[6, 2, 4], [8, 5, 9]]))
+      #Nx.Tensor<
+        f32[2][3]
+        [
+          [0.00905190035700798, 4.4198750401847064e-6, 0.9909441471099854],
+          [1.5838305989746004e-4, 4.833469624543341e-9, 0.9998416900634766]
+        ]
+      >
+  """
+
+  defn predict_probability(%__MODULE__{} = model, x) do
+    Nx.exp(predict_log_probability(model, x))
+  end
+
+  @doc """
+  Return joint log probability estimates for the test vector `x` using `model`.
+
+  ## Examples
+
+      iex> x = Nx.iota({4, 3})
+      iex> y = Nx.tensor([1, 2, 0, 2])
+      iex> model = Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3)
+      iex> Scholar.NaiveBayes.Bernoulli.predict_joint_log_probability(model, Nx.tensor([[6, 2, 4], [8, 5, 9]]))
+      #Nx.Tensor<
+        f32[2][3]
+        [
+          [3.6356334686279297, -3.988985061645508, 8.331316947937012],
+          [10.56710433959961, 0.16989731788635254, 19.317440032958984]
+        ]
+      >
+  """
+
+  defn predict_joint_log_probability(%__MODULE__{} = model, x) do
+    check_dim(x, Nx.axis_size(model.feature_count, 1))
+    joint_log_likelihood(model, x)
+  end
+
+  # Raises unless `x` has `dim` columns, the number of features the model was fit on.
+  defnp check_dim(x, dim) do
+    num_features = Nx.axis_size(x, 1)
+
+    if num_features != dim do
+      raise ArgumentError,
+            """
+            expected x to have same second dimension as data used for fitting model, \
+            got: #{num_features} for x and #{dim} for training data\
+            """
+    end
+  end
+
+  # Joint log likelihood of every class for each row of `x`, computed as
+  # x . (log p - log(1 - p)) + sum(log(1 - p)) + log prior, with p = P(x_i = 1 | y).
+  defnp joint_log_likelihood(
+          %__MODULE__{
+            feature_log_probability: feature_log_probability,
+            class_log_priors: class_log_priors
+          },
+          x
+        ) do
+    neg_prob = Nx.log(1 - Nx.exp(feature_log_probability))
+    jll = Nx.dot(x, [1], feature_log_probability - neg_prob, [1])
+    jll + class_log_priors + Nx.sum(neg_prob, axes: [1])
+  end
+end
diff --git a/lib/scholar/preprocessing/binarizer.ex b/lib/scholar/preprocessing/binarizer.ex
new file mode 100644
index 00000000..889ce1d5
--- /dev/null
+++ b/lib/scholar/preprocessing/binarizer.ex
@@ -0,0 +1,61 @@
+defmodule Scholar.Preprocessing.Binarizer do
+  @moduledoc """
+  Binarize data according to a threshold.
+  """
+  import Nx.Defn
+
+  binarize_schema = [
+    threshold: [
+      type: :float,
+      default: 0.0,
+      doc: """
+      Feature values below or equal to this are replaced by 0, above it by 1.
+      """
+    ]
+  ]
+
+  @binarize_schema NimbleOptions.new!(binarize_schema)
+
+  @doc """
+  Values greater than the threshold map to 1, while values less than
+  or equal to the threshold map to 0. With the default threshold of 0,
+  only positive values map to 1.
+
+  ## Options
+
+  #{NimbleOptions.docs(@binarize_schema)}
+
+  ## Examples
+
+      iex> t = Nx.tensor([[0, 0, 0], [3, 4, 5], [-2, 4, 3]])
+      iex> Scholar.Preprocessing.Binarizer.fit_transform(t, threshold: 3.0)
+      #Nx.Tensor<
+        u8[3][3]
+        [
+          [0, 0, 0],
+          [0, 1, 1],
+          [0, 1, 0]
+        ]
+      >
+
+      iex> t = Nx.tensor([[0, 0, 0], [3, 4, 5], [-2, 4, 3]])
+      iex> Scholar.Preprocessing.Binarizer.fit_transform(t, threshold: 0.4)
+      #Nx.Tensor<
+        u8[3][3]
+        [
+          [0, 0, 0],
+          [1, 1, 1],
+          [0, 1, 1]
+        ]
+      >
+  """
+  deftransform fit_transform(tensor, opts \\ []) do
+    binarize_n(tensor, NimbleOptions.validate!(opts, @binarize_schema))
+  end
+
+  # Element-wise comparison; yields 1 where `tensor` exceeds the threshold, else 0.
+  defnp binarize_n(tensor, opts) do
+    threshold = opts[:threshold]
+    tensor > threshold
+  end
+end
diff --git a/test/scholar/naive_bayes/bernoulli_test.exs b/test/scholar/naive_bayes/bernoulli_test.exs
new file mode 100644
index 00000000..77adaad5
--- /dev/null
+++ b/test/scholar/naive_bayes/bernoulli_test.exs
@@ -0,0 +1,195 @@
+defmodule Scholar.NaiveBayes.BernoulliTest do
+  use Scholar.Case, async: true
+  alias Scholar.NaiveBayes.Bernoulli
+  doctest Bernoulli
+
+  describe "fit" do
+    test "binary y" do
+      x = Nx.iota({5, 6})
+      x = Scholar.Preprocessing.Binarizer.fit_transform(x)
+      y = Nx.tensor([1, 0, 1, 0, 1])
+
+      model = Bernoulli.fit(x, y, num_classes: 2, binarize: nil)
+
+      assert model.feature_count ==
+               Nx.tensor([
+                 [2.0, 2.0, 2.0, 2.0, 2.0, 2.0],
+                 [2.0, 3.0, 3.0, 3.0, 3.0, 3.0]
+               ])
+
+      expected_feature_log_probability =
+        Nx.tensor([
+          [-0.28768207, -0.28768207, -0.28768207, -0.28768207, -0.28768207, -0.28768207],
+          [-0.51082562, -0.22314355, -0.22314355, -0.22314355, -0.22314355, -0.22314355]
+        ])
+
+      assert_all_close(model.feature_log_probability, expected_feature_log_probability)
+
+      expected_class_log_priors =
+        Nx.tensor([
+          -0.91629073,
+          -0.51082562
+        ])
+
+      assert_all_close(model.class_log_priors, expected_class_log_priors)
+
+      assert model.class_count == Nx.tensor([2.0, 3.0])
+    end
+
+    test ":alpha set to a different value" do
+      x = Nx.iota({5, 6})
+      y = Nx.tensor([1, 2, 6, 3, 1])
+
+      model = Bernoulli.fit(x, y, num_classes: 4, alpha: 0.4)
+
+      assert model.feature_count ==
+               Nx.tensor([
+                 [1.0, 2.0, 2.0, 2.0, 2.0, 2.0],
+                 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+                 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+                 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
+               ])
+
+      expected_feature_log_probability =
+        Nx.tensor([
+          [-0.69314718, -0.15415068, -0.15415068, -0.15415068, -0.15415068, -0.15415068],
+          [-0.25131443, -0.25131443, -0.25131443, -0.25131443, -0.25131443, -0.25131443],
+          [-0.25131443, -0.25131443, -0.25131443, -0.25131443, -0.25131443, -0.25131443],
+          [-0.25131443, -0.25131443, -0.25131443, -0.25131443, -0.25131443, -0.25131443]
+        ])
+
+      assert_all_close(model.feature_log_probability, expected_feature_log_probability)
+
+      expected_class_log_priors =
+        Nx.tensor([-0.91629073, -1.60943791, -1.60943791, -1.60943791])
+
+      assert_all_close(model.class_log_priors, expected_class_log_priors)
+      assert_all_close(expected_class_log_priors, model.class_log_priors)
+
+      assert model.class_count == Nx.tensor([2.0, 1.0, 1.0, 1.0])
+    end
+
+    test ":alpha set as a list with one value per feature" do
+      x = Nx.tensor([[1, 0, 1], [1, 1, 0], [0, 1, 1]])
+      y = Nx.tensor([0, 0, 1])
+
+      model = Bernoulli.fit(x, y, num_classes: 2, alpha: [1.0, 2.0, 0.5])
+
+      assert model.feature_count ==
+               Nx.tensor([
+                 [2.0, 1.0, 1.0],
+                 [0.0, 1.0, 1.0]
+               ])
+
+      expected_feature_log_probability =
+        Nx.tensor([
+          [-0.28768207, -0.69314718, -0.69314718],
+          [-1.09861229, -0.51082562, -0.28768207]
+        ])
+
+      assert_all_close(model.feature_log_probability, expected_feature_log_probability)
+
+      assert model.class_count == Nx.tensor([2.0, 1.0])
+    end
+
+    test ":fit_priors set to false" do
+      x = Nx.iota({5, 6})
+      y = Nx.tensor([1, 0, 1, 0, 1])
+
+      model = Bernoulli.fit(x, y, num_classes: 2, fit_priors: false)
+
+      assert model.feature_count ==
+               Nx.tensor([
+                 [2.0, 2.0, 2.0, 2.0, 2.0, 2.0],
+                 [2.0, 3.0, 3.0, 3.0, 3.0, 3.0]
+               ])
+
+      expected_feature_log_probability =
+        Nx.tensor([
+          [-0.28768207, -0.28768207, -0.28768207, -0.28768207, -0.28768207, -0.28768207],
+          [-0.51082562, -0.22314355, -0.22314355, -0.22314355, -0.22314355, -0.22314355]
+        ])
+
+      assert_all_close(model.feature_log_probability, expected_feature_log_probability)
+
+      expected_class_log_priors =
+        Nx.tensor([-0.69314718, -0.69314718])
+
+      assert_all_close(model.class_log_priors, expected_class_log_priors)
+      assert_all_close(expected_class_log_priors, model.class_log_priors)
+
+      assert model.class_count == Nx.tensor([2.0, 3.0])
+    end
+
+    test ":class_priors are set as a list" do
+      x = Nx.iota({5, 6})
+      y = Nx.tensor([1, 2, 3, 2, 1])
+
+      model = Bernoulli.fit(x, y, num_classes: 3, class_priors: [0.3, 0.4, 0.3])
+
+      assert model.feature_count ==
+               Nx.tensor([
+                 [1.0, 2.0, 2.0, 2.0, 2.0, 2.0],
+                 [2.0, 2.0, 2.0, 2.0, 2.0, 2.0],
+                 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
+               ])
+
+      expected_feature_log_probability =
+        Nx.tensor([
+          [-0.69314718, -0.28768207, -0.28768207, -0.28768207, -0.28768207, -0.28768207],
+          [-0.28768207, -0.28768207, -0.28768207, -0.28768207, -0.28768207, -0.28768207],
+          [-0.40546511, -0.40546511, -0.40546511, -0.40546511, -0.40546511, -0.40546511]
+        ])
+
+      assert_all_close(model.feature_log_probability, expected_feature_log_probability)
+
+      expected_class_log_priors =
+        Nx.tensor([-1.2039728, -0.91629073, -1.2039728])
+
+      assert_all_close(model.class_log_priors, expected_class_log_priors)
+      assert_all_close(expected_class_log_priors, model.class_log_priors)
+
+      assert model.class_count == Nx.tensor([2.0, 2.0, 1.0])
+    end
+
+    test "error handling for wrong input shapes" do
+      assert_raise ArgumentError,
+                   "expected x to have shape {num_samples, num_features}, got tensor with shape: {4}",
+                   fn ->
+                     Bernoulli.fit(
+                       Nx.tensor([1, 2, 3, 4]),
+                       Nx.tensor([1, 0, 1, 0]),
+                       num_classes: 2
+                     )
+                   end
+
+      assert_raise ArgumentError,
+                   "expected y to have shape {num_samples}, got tensor with shape: {1, 4}",
+                   fn ->
+                     Bernoulli.fit(
+                       Nx.tensor([[1, 2, 3, 4]]),
+                       Nx.tensor([[1, 0, 1, 0]]),
+                       num_classes: 2
+                     )
+                   end
+    end
+  end
+
+  describe "predict" do
+    test "predicts classes correctly for new data" do
+      x = Nx.iota({5, 6})
+      y = Nx.tensor([1, 2, 3, 4, 5])
+
+      jit_model = Nx.Defn.jit(&Bernoulli.fit/3)
+      model = jit_model.(x, y, num_classes: 5)
+
+      x_test = Nx.tensor([[1, 2, 3, 4, 5, 6], [0, 0, 0, 0, 0, 0]])
+
+      jit_predict = Nx.Defn.jit(&Bernoulli.predict/3)
+      predictions = jit_predict.(model, x_test, Nx.tensor([1, 2, 3, 4, 5]))
+
+      expected_predictions = Nx.tensor([2, 1])
+      assert predictions == expected_predictions
+    end
+  end
+end
diff --git a/test/scholar/preprocessing/binarizer_test.exs b/test/scholar/preprocessing/binarizer_test.exs
new file mode 100644
index 00000000..8bf33142
--- /dev/null
+++ b/test/scholar/preprocessing/binarizer_test.exs
@@ -0,0 +1,34 @@
+defmodule Scholar.Preprocessing.BinarizerTest do
+  use Scholar.Case, async: true
+  alias Scholar.Preprocessing.Binarizer
+  doctest Binarizer
+
+  describe "binarization" do
+    test "binarize with positive threshold" do
+      tensor = Nx.tensor([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [-2.0, -1.0, 0.0]])
+
+      jit_binarizer = Nx.Defn.jit(&Binarizer.fit_transform/2)
+
+      result = jit_binarizer.(tensor, threshold: 2.0)
+
+      assert Nx.to_flat_list(result) == [0, 0, 0, 1, 1, 1, 0, 0, 0]
+    end
+
+    test "binarize values with default threshold" do
+      tensor = Nx.tensor([[0.0, -1.0, 2.0], [3.0, 4.0, -5.0], [-2.0, 1.0, 0.0]])
+
+      result = Binarizer.fit_transform(tensor)
+
+      assert Nx.to_flat_list(result) == [0, 0, 1, 1, 1, 0, 0, 1, 0]
+    end
+
+    test "binarize with threshold less than 0" do
+      tensor = Nx.tensor([[0.0, 0.5, -0.5], [-0.1, -0.2, -0.3]])
+      jit_binarizer = Nx.Defn.jit(&Binarizer.fit_transform/2)
+
+      result = jit_binarizer.(tensor, threshold: -0.2)
+
+      assert Nx.to_flat_list(result) == [1, 1, 0, 1, 0, 0]
+    end
+  end
+end