From 78fb5eb03ab0d389fd31661f490da8922e00d452 Mon Sep 17 00:00:00 2001 From: srzeszut Date: Wed, 30 Oct 2024 09:58:30 +0100 Subject: [PATCH 1/7] add bernoulliNB and binarizer --- lib/scholar/naive_bayes/bernoulli.ex | 407 ++++++++++++++++++ lib/scholar/preprocessing/binarizer.ex | 56 +++ test/scholar/naive_bayes/bernoulli_test.exs | 173 ++++++++ test/scholar/preprocessing/binarizer_test.exs | 34 ++ 4 files changed, 670 insertions(+) create mode 100644 lib/scholar/naive_bayes/bernoulli.ex create mode 100644 lib/scholar/preprocessing/binarizer.ex create mode 100644 test/scholar/naive_bayes/bernoulli_test.exs create mode 100644 test/scholar/preprocessing/binarizer_test.exs diff --git a/lib/scholar/naive_bayes/bernoulli.ex b/lib/scholar/naive_bayes/bernoulli.ex new file mode 100644 index 00000000..f6da0e44 --- /dev/null +++ b/lib/scholar/naive_bayes/bernoulli.ex @@ -0,0 +1,407 @@ +defmodule Scholar.NaiveBayes.Bernoulli do + @moduledoc """ + Naive Bayes classifier for multivariate Bernoulli models. + Like MultinomialNB, this classifier is suitable for discrete data. The + difference is that while MultinomialNB works with occurrence counts, + BernoulliNB is designed for binary/boolean features. + """ + import Nx.Defn + import Scholar.Shared + + @derive {Nx.Container, + containers: [ + :feature_count, + :class_count, + :class_log_priors, + :feature_log_probability + ]} + defstruct [:feature_count, :class_count, :class_log_priors, :feature_log_probability] + + opts_schema = [ + num_classes: [ + type: :pos_integer, + required: true, + doc: ~S""" + Number of different classes used in training. + """ + ], + alpha: [ + type: {:or, [:float, {:list, :float}]}, + default: 1.0, + doc: ~S""" + Additive (Laplace/Lidstone) smoothing parameter + (set alpha to 0.0 and force_alpha to true, for no smoothing). + """ + ], + force_alpha: [ + type: :boolean, + default: true, + doc: ~S""" + If `false` and alpha is less than 1e-10, it will set alpha to + 1e-10. If `true`, alpha will remain unchanged. This may cause + numerical errors if alpha is too close to 0. + """ + ], + binarize: [ + type: {:or, [:float, {:in, [nil]}]}, + default: 0.0, + doc: ~S""" + Threshold for binarizing (mapping to booleans) of sample features. + If nil, input is presumed to already consist of binary vectors. + """ + ], + fit_priors: [ + type: :boolean, + default: true, + doc: ~S""" + Whether to learn class prior probabilities or not. + If `false`, a uniform prior will be used. + """ + ], + class_priors: [ + type: {:custom, Scholar.Options, :weights, []}, + doc: ~S""" + Prior probabilities of the classes. If specified, the priors are not + adjusted according to the data. + """ + ], + sample_weights: [ + type: {:custom, Scholar.Options, :weights, []}, + doc: ~S""" + List of `num_samples` elements. + A list of 1.0 values is used if none is given. + """ + ] + ] + + @opts_schema NimbleOptions.new!(opts_schema) + + @doc """ + Fits a naive Bayes model. The function assumes that the targets `y` are integers + between 0 and `num_classes` - 1 (inclusive). Otherwise, those samples will not + contribute to `class_count`. + ## Options + #{NimbleOptions.docs(@opts_schema)} + ## Return Values + The function returns a struct with the following parameters: + * `:class_count` - Number of samples encountered for each class during fitting. This + value is weighted by the sample weight when provided. + * `:class_log_priors` - Smoothed empirical log probability for each class. + * `:feature_count` - Number of samples encountered for each (class, feature) + during fitting. This value is weighted by the sample weight when + provided. + * `:feature_log_probability` - Empirical log probability of features + given a class, ``P(x_i|y)``. + ## Examples + iex> x = Nx.iota({4, 3}) + iex> y = Nx.tensor([1, 2, 0, 2]) + iex> Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3, binarize: 1.0) + %Scholar.NaiveBayes.Bernoulli{ + feature_count: Nx.tensor( + [ + [1.0, 1.0, 1.0], + [0.0, 0.0, 1.0], + [2.0, 2.0, 2.0] + ] + ), + class_count: Nx.tensor( + [1.0, 1.0, 2.0] + ), + class_log_priors: Nx.tensor( + [-1.3862943649291992, -1.3862943649291992, -0.6931471824645996] + ), + feature_log_probability: Nx.tensor( + [ + [-0.40546512603759766, -0.40546512603759766, -0.40546512603759766], + [-1.0986123085021973, -1.0986123085021973, -0.40546512603759766], + [-0.28768205642700195, -0.28768205642700195, -0.28768205642700195] + ] + ) + } + iex> x = Nx.iota({4, 3}) + iex> y = Nx.tensor([1, 2, 0, 2]) + iex> Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3, force_alpha: false, alpha: 0.0) + %Scholar.NaiveBayes.Bernoulli{ + feature_count: Nx.tensor( + [ + [1.0, 1.0, 1.0], + [0.0, 1.0, 1.0], + [2.0, 2.0, 2.0] + ] + ), + class_count: Nx.tensor( + [1.0, 1.0, 2.0] + ), + class_log_priors: Nx.tensor( + [-1.3862943649291992, -1.3862943649291992, -0.6931471824645996] + ), + feature_log_probability: Nx.tensor( + [ + [0.0, 0.0, 0.0], + [-23.025850296020508, 0.0, 0.0], + [0.0, 0.0, 0.0] + ] + ) + } + """ + deftransform fit(x, y, opts \\ []) do + if Nx.rank(x) != 2 do + raise ArgumentError, + """ + expected x to have shape {num_samples, num_features}, \ + got tensor with shape: #{inspect(Nx.shape(x))}\ + """ + end + + if Nx.rank(y) != 1 do + raise ArgumentError, + """ + expected y to have shape {num_samples}, \ + got tensor with shape: #{inspect(Nx.shape(y))}\ + """ + end + + {num_samples, num_features} = Nx.shape(x) + + if num_samples != Nx.axis_size(y, 0) do + raise ArgumentError, + """ + expected first dimension of x and y to be of same size, \ + got: #{num_samples} and #{Nx.axis_size(y, 0)}\ + """ + end + + opts = NimbleOptions.validate!(opts, @opts_schema) + type = to_float_type(x) + + x_binarize = + if opts[:binarize] != nil, + do: Scholar.Preprocessing.Binarizer.fit_transform(x, threshold: opts[:binarize]), + else: x + + {alpha, opts} = Keyword.pop!(opts, :alpha) + alpha = Nx.tensor(alpha, type: type) + + if Nx.shape(alpha) not in [{}, {num_features}] do + raise ArgumentError, + """ + when alpha is list it should have length equal to num_features = #{num_features}, \ + got: #{Nx.size(alpha)}\ + """ + end + + num_classes = opts[:num_classes] + + priors_flag = opts[:class_priors] != nil + + {class_priors, opts} = Keyword.pop(opts, :class_priors, :nan) + class_priors = Nx.tensor(class_priors) + + if priors_flag and Nx.size(class_priors) != num_classes do + raise ArgumentError, + """ + expected class_priors to be list of length num_classes = #{num_classes}, \ + got: #{Nx.size(class_priors)}\ + """ + end + + sample_weights_flag = opts[:sample_weights] != nil + + {sample_weights, opts} = Keyword.pop(opts, :sample_weights, :nan) + sample_weights = Nx.tensor(sample_weights, type: type) + + if sample_weights_flag and Nx.shape(sample_weights) != {num_samples} do + raise ArgumentError, + """ + expected sample_weights to be list of length num_samples = #{num_samples}, \ + got: #{Nx.size(sample_weights)}\ + """ + end + + opts = + opts ++ + [ + type: type, + priors_flag: priors_flag, + sample_weights_flag: sample_weights_flag + ] + + fit_n(x_binarize, y, class_priors, sample_weights, alpha, opts) + end + + defnp fit_n(x, y, class_priors, sample_weights, alpha, opts) do + type = opts[:type] + num_samples = Nx.axis_size(x, 0) + + num_classes = opts[:num_classes] + + y_one_hot = Scholar.Preprocessing.OneHotEncoder.fit_transform(y, num_categories: num_classes) + y_one_hot = Nx.select(y_one_hot, Nx.tensor(1, type: type), Nx.tensor(0, type: type)) + + y_weighted = + if opts[:sample_weights_flag], + do: Nx.reshape(sample_weights, {num_samples, 1}) * y_one_hot, + else: y_one_hot + + alpha_lower_bound = Nx.tensor(1.0e-10, type: type) + + alpha = + if opts[:force_alpha], do: alpha, else: Nx.max(alpha, alpha_lower_bound) + + class_count = Nx.sum(y_weighted, axes: [0]) + feature_count = Nx.dot(y_weighted, [0], x, [0]) + + smoothed_feature_count = feature_count + alpha + smoothed_cumulative_count = class_count + alpha * 2 + + feature_log_probability = + Nx.log(smoothed_feature_count) - + Nx.log(Nx.reshape(smoothed_cumulative_count, {num_classes, 1})) + + class_log_priors = + cond do + opts[:priors_flag] -> + Nx.log(class_priors) + + opts[:fit_priors] -> + Nx.log(class_count) - Nx.log(Nx.sum(class_count)) + + true -> + Nx.broadcast(-Nx.log(num_classes), {num_classes}) + end + + %__MODULE__{ + class_count: class_count, + class_log_priors: class_log_priors, + feature_count: feature_count, + feature_log_probability: feature_log_probability + } + end + + @doc """ + Perform classification on an array of test vectors `x` using `model`. + You need to add sorted classes from the training data as the second argument. + ## Examples + iex> x = Nx.iota({4, 3}) + iex> y = Nx.tensor([1, 2, 0, 2]) + iex> model = Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3) + iex> Scholar.NaiveBayes.Bernoulli.predict(model, Nx.tensor([[6, 2, 4], [8, 5, 9]]), Nx.tensor([0, 1, 2])) + #Nx.Tensor< + s64[2] + [2, 2] + > + """ + defn predict(%__MODULE__{} = model, x, classes) do + check_dim(x, Nx.axis_size(model.feature_count, 1)) + + if Nx.rank(classes) != 1 do + raise ArgumentError, + """ + expected classes to be a 1D tensor, \ + got tensor with shape: #{inspect(Nx.shape(classes))}\ + """ + end + + if Nx.axis_size(classes, 0) != Nx.axis_size(model.class_count, 0) do + raise ArgumentError, + """ + expected classes to have same size as the number of classes in the model, \ + got: #{Nx.axis_size(classes, 0)} for classes and #{Nx.axis_size(model.class_count, 0)} for model\ + """ + end + + jll = joint_log_likelihood(model, x) + classes[Nx.argmax(jll, axis: 1)] + end + + @doc """ + Return log-probability estimates for the test vector `x` using `model`. + ## Examples + iex> x = Nx.iota({4, 3}) + iex> y = Nx.tensor([1, 2, 0, 2]) + iex> model = Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3) + iex> Scholar.NaiveBayes.Bernoulli.predict_log_probability(model, Nx.tensor([[6, 2, 4], [8, 5, 9]])) + #Nx.Tensor< + f32[2][3] + [ + [-4.704780578613281, -12.329399108886719, -0.009097099304199219], + [-8.750494003295898, -19.147701263427734, -1.583099365234375e-4] + ] + > + """ + defn predict_log_probability(%__MODULE__{} = model, x) do + check_dim(x, Nx.axis_size(model.feature_count, 1)) + jll = joint_log_likelihood(model, x) + + log_proba_x = + jll + |> Nx.logsumexp(axes: [1]) + |> Nx.new_axis(1) + |> Nx.broadcast(jll) + + jll - log_proba_x + end + + @doc """ + Return probability estimates for the test vector `x` using `model`. + ## Examples + iex> x = Nx.iota({4, 3}) + iex> y = Nx.tensor([1, 2, 0, 2]) + iex> model = Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3) + iex> Scholar.NaiveBayes.Bernoulli.predict_probability(model, Nx.tensor([[6, 2, 4], [8, 5, 9]])) + #Nx.Tensor< + f32[2][3] + [ + [0.00905190035700798, 4.4198750401847064e-6, 0.9909441471099854], + [1.5838305989746004e-4, 4.833469624543341e-9, 0.9998416900634766] + ] + > + """ + defn predict_probability(%__MODULE__{} = model, x) do + Nx.exp(predict_log_probability(model, x)) + end + + @doc """ + Return joint log probability estimates for the test vector `x` using `model`. + ## Examples + iex> x = Nx.iota({4, 3}) + iex> y = Nx.tensor([1, 2, 0, 2]) + iex> model = Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3) + iex> Scholar.NaiveBayes.Bernoulli.predict_joint_log_probability(model, Nx.tensor([[6, 2, 4], [8, 5, 9]])) + #Nx.Tensor< + f32[2][3] + [ + [3.6356334686279297, -3.988985061645508, 8.331316947937012], + [10.56710433959961, 0.16989731788635254, 19.317440032958984] + ] + > + """ + defn predict_joint_log_probability(%__MODULE__{} = model, x) do + check_dim(x, Nx.axis_size(model.feature_count, 1)) + joint_log_likelihood(model, x) + end + + defnp check_dim(x, dim) do + num_features = Nx.axis_size(x, 1) + + if num_features != dim do + raise ArgumentError, + """ + expected x to have same second dimension as data used for fitting model, \ + got: #{num_features} for x and #{dim} for training data\ + """ + end + end + + defnp joint_log_likelihood( + %__MODULE__{ + feature_log_probability: feature_log_probability, + class_log_priors: class_log_priors + }, + x + ) do + neg_prob = Nx.log(1 - Nx.exp(feature_log_probability)) + jll = Nx.dot(x, [1], feature_log_probability - neg_prob, [1]) + jll + class_log_priors + Nx.sum(neg_prob, axes: [1]) + end +end diff --git a/lib/scholar/preprocessing/binarizer.ex b/lib/scholar/preprocessing/binarizer.ex new file mode 100644 index 00000000..ec468cb3 --- /dev/null +++ b/lib/scholar/preprocessing/binarizer.ex @@ -0,0 +1,56 @@ +defmodule Scholar.Preprocessing.Binarizer do + @moduledoc """ + Binarize data according to a threshold. + """ + import Nx.Defn + + binarize_schema = [ + threshold: [ + type: :float, + default: 0.0, + doc: """ + Feature values below or equal to this are replaced by 0, above it by 1. + Threshold may not be less than 0 for operations on sparse matrices. + """ + ] + ] + + @binarize_schema NimbleOptions.new!(binarize_schema) + + @doc """ + Values greater than the threshold map to 1, while values less than + or equal to the threshold map to 0. With the default threshold of 0, + only positive values map to 1. + ## Options + #{NimbleOptions.docs(@binarize_schema)} + ## Examples + iex> t = Nx.tensor([[0, 0, 0], [3, 4, 5], [-2, 4, 3]]) + iex> Scholar.Preprocessing.Binarizer.fit_transform(t, threshold: 3.0) + #Nx.Tensor< + s64[3][3] + [ + [0, 0, 0], + [0, 1, 1], + [0, 1, 0] + ] + > + iex> t = Nx.tensor([[0, 0, 0], [3, 4, 5], [-2, 4, 3]]) + iex> Scholar.Preprocessing.Binarizer.fit_transform(t,threshold: 0.4) + #Nx.Tensor< + s64[3][3] + [ + [0, 0, 0], + [1, 1, 1], + [0, 1, 1] + ] + > + """ + deftransform fit_transform(tensor, opts \\ []) do + binarize_n(tensor, NimbleOptions.validate!(opts, @binarize_schema)) + end + + defnp binarize_n(tensor, opts) do + threshold = opts[:threshold] + Nx.select(Nx.greater(tensor, threshold), 1, 0) + end +end diff --git a/test/scholar/naive_bayes/bernoulli_test.exs b/test/scholar/naive_bayes/bernoulli_test.exs new file mode 100644 index 00000000..77adaad5 --- /dev/null +++ b/test/scholar/naive_bayes/bernoulli_test.exs @@ -0,0 +1,173 @@ +defmodule Scholar.NaiveBayes.BernoulliTest do + use Scholar.Case, async: true + alias Scholar.NaiveBayes.Bernoulli + doctest Bernoulli + + describe "fit" do + test "binary y" do + x = Nx.iota({5, 6}) + x = Scholar.Preprocessing.Binarizer.fit_transform(x) + y = Nx.tensor([1, 0, 1, 0, 1]) + + model = Bernoulli.fit(x, y, num_classes: 2, binarize: nil) + + assert model.feature_count == + Nx.tensor([ + [2.0, 2.0, 2.0, 2.0, 2.0, 2.0], + [2.0, 3.0, 3.0, 3.0, 3.0, 3.0] + ]) + + expected_feature_log_probability = + Nx.tensor([ + [-0.28768207, -0.28768207, -0.28768207, -0.28768207, -0.28768207, -0.28768207], + [-0.51082562, -0.22314355, -0.22314355, -0.22314355, -0.22314355, -0.22314355] + ]) + + assert_all_close(model.feature_log_probability, expected_feature_log_probability) + + expected_class_log_priors = + Nx.tensor([ + -0.91629073, + -0.51082562 + ]) + + assert_all_close(model.class_log_priors, expected_class_log_priors) + + assert model.class_count == Nx.tensor([2.0, 3.0]) + end + + test ":alpha set to a different value" do + x = Nx.iota({5, 6}) + y = Nx.tensor([1, 2, 6, 3, 1]) + + model = Bernoulli.fit(x, y, num_classes: 4, alpha: 0.4) + + assert model.feature_count == + Nx.tensor([ + [1.0, 2.0, 2.0, 2.0, 2.0, 2.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + ]) + + expected_feature_log_probability = + Nx.tensor([ + [-0.69314718, -0.15415068, -0.15415068, -0.15415068, -0.15415068, -0.15415068], + [-0.25131443, -0.25131443, -0.25131443, -0.25131443, -0.25131443, -0.25131443], + [-0.25131443, -0.25131443, -0.25131443, -0.25131443, -0.25131443, -0.25131443], + [-0.25131443, -0.25131443, -0.25131443, -0.25131443, -0.25131443, -0.25131443] + ]) + + assert_all_close(model.feature_log_probability, expected_feature_log_probability) + + expected_class_log_priors = + Nx.tensor([-0.91629073, -1.60943791, -1.60943791, -1.60943791]) + + assert_all_close(model.class_log_priors, expected_class_log_priors) + assert_all_close(expected_class_log_priors, model.class_log_priors) + + assert model.class_count == Nx.tensor([2.0, 1.0, 1.0, 1.0]) + end + + test ":fit_priors set to false" do + x = Nx.iota({5, 6}) + y = Nx.tensor([1, 0, 1, 0, 1]) + + model = Bernoulli.fit(x, y, num_classes: 2, fit_priors: false) + + assert model.feature_count == + Nx.tensor([ + [2.0, 2.0, 2.0, 2.0, 2.0, 2.0], + [2.0, 3.0, 3.0, 3.0, 3.0, 3.0] + ]) + + expected_feature_log_probability = + Nx.tensor([ + [-0.28768207, -0.28768207, -0.28768207, -0.28768207, -0.28768207, -0.28768207], + [-0.51082562, -0.22314355, -0.22314355, -0.22314355, -0.22314355, -0.22314355] + ]) + + assert_all_close(model.feature_log_probability, expected_feature_log_probability) + + expected_class_log_priors = + Nx.tensor([-0.69314718, -0.69314718]) + + assert_all_close(model.class_log_priors, expected_class_log_priors) + assert_all_close(expected_class_log_priors, model.class_log_priors) + + assert model.class_count == Nx.tensor([2.0, 3.0]) + end + + # + test "fit test - :class_priors are set as a list" do + x = Nx.iota({5, 6}) + y = Nx.tensor([1, 2, 3, 2, 1]) + + model = Bernoulli.fit(x, y, num_classes: 3, class_priors: [0.3, 0.4, 0.3]) + + assert model.feature_count == + Nx.tensor([ + [1.0, 2.0, 2.0, 2.0, 2.0, 2.0], + [2.0, 2.0, 2.0, 2.0, 2.0, 2.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + ]) + + expected_feature_log_probability = + Nx.tensor([ + [-0.69314718, -0.28768207, -0.28768207, -0.28768207, -0.28768207, -0.28768207], + [-0.28768207, -0.28768207, -0.28768207, -0.28768207, -0.28768207, -0.28768207], + [-0.40546511, -0.40546511, -0.40546511, -0.40546511, -0.40546511, -0.40546511] + ]) + + assert_all_close(model.feature_log_probability, expected_feature_log_probability) + + expected_class_log_priors = + Nx.tensor([-1.2039728, -0.91629073, -1.2039728]) + + assert_all_close(model.class_log_priors, expected_class_log_priors) + assert_all_close(expected_class_log_priors, model.class_log_priors) + + assert model.class_count == Nx.tensor([2.0, 2.0, 1.0]) + end + + test "error handling for wrong input shapes" do + assert_raise ArgumentError, + "expected x to have shape {num_samples, num_features}, got tensor with shape: {4}", + fn -> + Bernoulli.fit( + Nx.tensor([1, 2, 3, 4]), + Nx.tensor([1, 0, 1, 0]), + num_classes: 2 + ) + end + + assert_raise ArgumentError, + "expected y to have shape {num_samples}, got tensor with shape: {1, 4}", + fn -> + Bernoulli.fit( + Nx.tensor([[1, 2, 3, 4]]), + Nx.tensor([[1, 0, 1, 0]]), + num_classes: 2 + ) + end + end + end + + describe "predict" do + test "predicts classes correctly for new data" do + x = Nx.iota({5, 6}) + y = Nx.tensor([1, 2, 3, 4, 5]) + + jit_model = Nx.Defn.jit(&Bernoulli.fit/3) + model = jit_model.(x, y, num_classes: 5) + + x_test = Nx.tensor([[1, 2, 3, 4, 5, 6], [0, 0, 0, 0, 0, 0]]) + + jit_predict = Nx.Defn.jit(&Bernoulli.predict/3) + predictions = jit_predict.(model, x_test, Nx.tensor([1, 2, 3, 4, 5])) + + expected_predictions = Nx.tensor([2, 1]) + assert predictions == expected_predictions + end + end +end diff --git a/test/scholar/preprocessing/binarizer_test.exs b/test/scholar/preprocessing/binarizer_test.exs new file mode 100644 index 00000000..8bf33142 --- /dev/null +++ b/test/scholar/preprocessing/binarizer_test.exs @@ -0,0 +1,34 @@ +defmodule Scholar.Preprocessing.BinarizerTest do + use Scholar.Case, async: true + alias Scholar.Preprocessing.Binarizer + doctest Binarizer + + describe "binarization" do + test "binarize with positive threshold" do + tensor = Nx.tensor([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [-2.0, -1.0, 0.0]]) + + jit_binarizer = Nx.Defn.jit(&Binarizer.fit_transform/2) + + result = jit_binarizer.(tensor, threshold: 2.0) + + assert Nx.to_flat_list(result) == [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0] + end + + test "binarize values with default threshold" do + tensor = Nx.tensor([[0.0, -1.0, 2.0], [3.0, 4.0, -5.0], [-2.0, 1.0, 0.0]]) + + result = Binarizer.fit_transform(tensor) + + assert Nx.to_flat_list(result) == [0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0] + end + + test "binarize with threshold less than 0" do + tensor = Nx.tensor([[0.0, 0.5, -0.5], [-0.1, -0.2, -0.3]]) + jit_binarizer = Nx.Defn.jit(&Binarizer.fit_transform/2) + + result = jit_binarizer.(tensor, threshold: -0.2) + + assert Nx.to_flat_list(result) == [1.0, 1.0, 0.0, 1.0, 0.0, 0.0] + end + end +end From 648ea2a93f401721e38035c91583a0706b259015 Mon Sep 17 00:00:00 2001 From: srzeszut Date: Wed, 30 Oct 2024 11:51:57 +0100 Subject: [PATCH 2/7] refactor and move binarize to defn --- lib/scholar/naive_bayes/bernoulli.ex | 37 +++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/lib/scholar/naive_bayes/bernoulli.ex b/lib/scholar/naive_bayes/bernoulli.ex index f6da0e44..da49979e 100644 --- a/lib/scholar/naive_bayes/bernoulli.ex +++ b/lib/scholar/naive_bayes/bernoulli.ex @@ -80,19 +80,29 @@ defmodule Scholar.NaiveBayes.Bernoulli do Fits a naive Bayes model. The function assumes that the targets `y` are integers between 0 and `num_classes` - 1 (inclusive). Otherwise, those samples will not contribute to `class_count`. + ## Options + #{NimbleOptions.docs(@opts_schema)} + ## Return Values + The function returns a struct with the following parameters: + * `:class_count` - Number of samples encountered for each class during fitting. This value is weighted by the sample weight when provided. + * `:class_log_priors` - Smoothed empirical log probability for each class. + * `:feature_count` - Number of samples encountered for each (class, feature) during fitting. This value is weighted by the sample weight when provided. + * `:feature_log_probability` - Empirical log probability of features given a class, ``P(x_i|y)``. + ## Examples + iex> x = Nx.iota({4, 3}) iex> y = Nx.tensor([1, 2, 0, 2]) iex> Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3, binarize: 1.0) @@ -118,6 +128,7 @@ defmodule Scholar.NaiveBayes.Bernoulli do ] ) } + iex> x = Nx.iota({4, 3}) iex> y = Nx.tensor([1, 2, 0, 2]) iex> Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3, force_alpha: false, alpha: 0.0) @@ -144,6 +155,7 @@ defmodule Scholar.NaiveBayes.Bernoulli do ) } """ + deftransform fit(x, y, opts \\ []) do if Nx.rank(x) != 2 do raise ArgumentError, @@ -174,11 +186,6 @@ defmodule Scholar.NaiveBayes.Bernoulli do opts = NimbleOptions.validate!(opts, @opts_schema) type = to_float_type(x) - x_binarize = - if opts[:binarize] != nil, - do: Scholar.Preprocessing.Binarizer.fit_transform(x, threshold: opts[:binarize]), - else: x - {alpha, opts} = Keyword.pop!(opts, :alpha) alpha = Nx.tensor(alpha, type: type) @@ -226,7 +233,7 @@ defmodule Scholar.NaiveBayes.Bernoulli do sample_weights_flag: sample_weights_flag ] - fit_n(x_binarize, y, class_priors, sample_weights, alpha, opts) + fit_n(x, y, class_priors, sample_weights, alpha, opts) end defnp fit_n(x, y, class_priors, sample_weights, alpha, opts) do @@ -235,6 +242,12 @@ defmodule Scholar.NaiveBayes.Bernoulli do num_classes = opts[:num_classes] + x = + case opts[:binarize] do + nil -> x + binarize ->Scholar.Preprocessing.Binarizer.fit_transform(x, threshold: binarize) + end + y_one_hot = Scholar.Preprocessing.OneHotEncoder.fit_transform(y, num_categories: num_classes) y_one_hot = Nx.select(y_one_hot, Nx.tensor(1, type: type), Nx.tensor(0, type: type)) @@ -281,7 +294,9 @@ defmodule Scholar.NaiveBayes.Bernoulli do @doc """ Perform classification on an array of test vectors `x` using `model`. You need to add sorted classes from the training data as the second argument. + ## Examples + iex> x = Nx.iota({4, 3}) iex> y = Nx.tensor([1, 2, 0, 2]) iex> model = Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3) @@ -291,6 +306,7 @@ defmodule Scholar.NaiveBayes.Bernoulli do [2, 2] > """ + defn predict(%__MODULE__{} = model, x, classes) do check_dim(x, Nx.axis_size(model.feature_count, 1)) @@ -316,7 +332,9 @@ defmodule Scholar.NaiveBayes.Bernoulli do @doc """ Return log-probability estimates for the test vector `x` using `model`. + ## Examples + iex> x = Nx.iota({4, 3}) iex> y = Nx.tensor([1, 2, 0, 2]) iex> model = Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3) @@ -329,6 +347,7 @@ defmodule Scholar.NaiveBayes.Bernoulli do ] > """ + defn predict_log_probability(%__MODULE__{} = model, x) do check_dim(x, Nx.axis_size(model.feature_count, 1)) jll = joint_log_likelihood(model, x) @@ -344,7 +363,9 @@ defmodule Scholar.NaiveBayes.Bernoulli do @doc """ Return probability estimates for the test vector `x` using `model`. + ## Examples + iex> x = Nx.iota({4, 3}) iex> y = Nx.tensor([1, 2, 0, 2]) iex> model = Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3) @@ -357,13 +378,16 @@ defmodule Scholar.NaiveBayes.Bernoulli do ] > """ + defn predict_probability(%__MODULE__{} = model, x) do Nx.exp(predict_log_probability(model, x)) end @doc """ Return joint log probability estimates for the test vector `x` using `model`. + ## Examples + iex> x = Nx.iota({4, 3}) iex> y = Nx.tensor([1, 2, 0, 2]) iex> model = Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3) @@ -376,6 +400,7 @@ defmodule Scholar.NaiveBayes.Bernoulli do ] > """ + defn predict_joint_log_probability(%__MODULE__{} = model, x) do check_dim(x, Nx.axis_size(model.feature_count, 1)) joint_log_likelihood(model, x) From 7087beb03139c5ed4e554ec9e3f16db81e5cd5d9 Mon Sep 17 00:00:00 2001 From: srzeszut Date: Wed, 30 Oct 2024 11:55:28 +0100 Subject: [PATCH 3/7] apply mix format --- lib/scholar/naive_bayes/bernoulli.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/scholar/naive_bayes/bernoulli.ex b/lib/scholar/naive_bayes/bernoulli.ex index da49979e..40a0e9a4 100644 --- a/lib/scholar/naive_bayes/bernoulli.ex +++ b/lib/scholar/naive_bayes/bernoulli.ex @@ -245,7 +245,7 @@ defmodule Scholar.NaiveBayes.Bernoulli do x = case opts[:binarize] do nil -> x - binarize ->Scholar.Preprocessing.Binarizer.fit_transform(x, threshold: binarize) + binarize -> Scholar.Preprocessing.Binarizer.fit_transform(x, threshold: binarize) end y_one_hot = Scholar.Preprocessing.OneHotEncoder.fit_transform(y, num_categories: num_classes) From 572e6c6efd4d677038536b58acf2dd92550472cd Mon Sep 17 00:00:00 2001 From: srzeszut Date: Mon, 25 Nov 2024 16:06:25 +0100 Subject: [PATCH 4/7] siplified binarizier --- lib/scholar/preprocessing/binarizer.ex | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/scholar/preprocessing/binarizer.ex b/lib/scholar/preprocessing/binarizer.ex index ec468cb3..889ce1d5 100644 --- a/lib/scholar/preprocessing/binarizer.ex +++ b/lib/scholar/preprocessing/binarizer.ex @@ -27,7 +27,7 @@ defmodule Scholar.Preprocessing.Binarizer do iex> t = Nx.tensor([[0, 0, 0], [3, 4, 5], [-2, 4, 3]]) iex> Scholar.Preprocessing.Binarizer.fit_transform(t, threshold: 3.0) #Nx.Tensor< - s64[3][3] + u8[3][3] [ [0, 0, 0], [0, 1, 1], @@ -37,7 +37,7 @@ defmodule Scholar.Preprocessing.Binarizer do iex> t = Nx.tensor([[0, 0, 0], [3, 4, 5], [-2, 4, 3]]) iex> Scholar.Preprocessing.Binarizer.fit_transform(t,threshold: 0.4) #Nx.Tensor< - s64[3][3] + u8[3][3] [ [0, 0, 0], [1, 1, 1], @@ -51,6 +51,6 @@ defmodule Scholar.Preprocessing.Binarizer do defnp binarize_n(tensor, opts) do threshold = opts[:threshold] - Nx.select(Nx.greater(tensor, threshold), 1, 0) + tensor > threshold end end From 2a28cd4cafddd04ae9b98d6b4cd03f77a27382ff Mon Sep 17 00:00:00 2001 From: srzeszut Date: Wed, 27 Nov 2024 14:03:28 +0100 Subject: [PATCH 5/7] apply suggestions --- lib/scholar/naive_bayes/bernoulli.ex | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/scholar/naive_bayes/bernoulli.ex b/lib/scholar/naive_bayes/bernoulli.ex index 40a0e9a4..eba3cdc4 100644 --- a/lib/scholar/naive_bayes/bernoulli.ex +++ b/lib/scholar/naive_bayes/bernoulli.ex @@ -355,8 +355,7 @@ defmodule Scholar.NaiveBayes.Bernoulli do log_proba_x = jll |> Nx.logsumexp(axes: [1]) - |> Nx.new_axis(1) - |> Nx.broadcast(jll) + |> Nx.reshape({Nx.axis_size(jll, 0), 1}) jll - log_proba_x end From f7b89bc8e3b4696e5703953bf997f1bff508d1c7 Mon Sep 17 00:00:00 2001 From: srzeszut Date: Wed, 27 Nov 2024 15:04:15 +0100 Subject: [PATCH 6/7] fix doctests --- lib/scholar/naive_bayes/bernoulli.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/scholar/naive_bayes/bernoulli.ex b/lib/scholar/naive_bayes/bernoulli.ex index eba3cdc4..dd134c0c 100644 --- a/lib/scholar/naive_bayes/bernoulli.ex +++ b/lib/scholar/naive_bayes/bernoulli.ex @@ -302,7 +302,7 @@ defmodule Scholar.NaiveBayes.Bernoulli do iex> model = Scholar.NaiveBayes.Bernoulli.fit(x, y, num_classes: 3) iex> Scholar.NaiveBayes.Bernoulli.predict(model, Nx.tensor([[6, 2, 4], [8, 5, 9]]), Nx.tensor([0, 1, 2])) #Nx.Tensor< - s64[2] + s32[2] [2, 2] > """ From 8b03416144fe6ac73c23d65a6446b006bdb72ed9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Valim?= Date: Wed, 27 Nov 2024 15:17:06 +0100 Subject: [PATCH 7/7] Update lib/scholar/naive_bayes/bernoulli.ex --- lib/scholar/naive_bayes/bernoulli.ex | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/scholar/naive_bayes/bernoulli.ex b/lib/scholar/naive_bayes/bernoulli.ex index dd134c0c..cd9cc505 100644 --- a/lib/scholar/naive_bayes/bernoulli.ex +++ b/lib/scholar/naive_bayes/bernoulli.ex @@ -1,6 +1,7 @@ defmodule Scholar.NaiveBayes.Bernoulli do @moduledoc """ Naive Bayes classifier for multivariate Bernoulli models. + Like MultinomialNB, this classifier is suitable for discrete data. The difference is that while MultinomialNB works with occurrence counts, BernoulliNB is designed for binary/boolean features.